import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import multivariate_normal
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
%matplotlib inline
plt.rcParams['figure.dpi'] = 70

def sinusoidal(x):
    return np.sin(2 * np.pi * x)

def create_data(func, sample_size, std, domain=[0, 1]):
    x = np.linspace(*domain, sample_size)
    np.random.shuffle(x)
    t = func(x) + np.random.normal(scale=std, size=x.shape)
    return x, t

Given a training set $\mathcal{D} = \{(\mathbf{x}_i, y_i): i \in N = \{1, \ldots, n\}\}$ where

  • each observation $\mathbf{x}_i \equiv (x_{i1},\ldots,x_{ip})^\intercal$ has $p$ attributes, and

  • target value vector $\mathbf{y} \equiv (y_1,\ldots,y_n)^\intercal$

We shall fit the data using a linear model

\begin{align} f(\mathbf{x}, \mathbf{w}) = w_0 + w_1 \phi_1(\mathbf{x}) + w_2 \phi_2(\mathbf{x}) + \cdots + w_{m-1} \phi_{m-1}(\mathbf{x}) \nonumber \end{align}

where the $\phi_j$ are transformations of $\mathbf{x}$ called basis functions. We assume that the target value $y$ is given by the deterministic function $f(\mathbf{x}, \mathbf{w})$ with additive Gaussian noise $\epsilon \sim \mathcal{N}(0,\sigma^2)$, so that

\begin{align} y = f(\mathbf{x}, \mathbf{w}) + \epsilon \nonumber \end{align}

In matrix form, we have \begin{align*} f(\mathbf{x}, \mathbf{w}) &= \mathbf{\Phi} \mathbf{w} \end{align*} where \begin{align*} \mathbf{w} &= (w_0, \ldots, w_{m-1})^\intercal \end{align*} and \begin{align*} \mathbf{\Phi} &= \begin{pmatrix} \phi(\mathbf{x}_1)^\intercal \\ \phi(\mathbf{x}_2)^\intercal \\ \vdots \\ \phi(\mathbf{x}_n)^\intercal \end{pmatrix} = \begin{pmatrix} \phi_0(\mathbf{x}_1) = 1 & \phi_1(\mathbf{x}_1) & \phi_2(\mathbf{x}_1) & \ldots & \phi_{m-1}(\mathbf{x}_1) \\ \phi_0(\mathbf{x}_2) = 1 & \phi_1(\mathbf{x}_2) & \phi_2(\mathbf{x}_2) & \ldots & \phi_{m-1}(\mathbf{x}_2) \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ \phi_0(\mathbf{x}_n) = 1 & \phi_1(\mathbf{x}_n) & \phi_2(\mathbf{x}_n) & \ldots & \phi_{m-1}(\mathbf{x}_n) \end{pmatrix}_{n \times m} \end{align*}

  • When $\phi_j(\mathbf{x}) = x_j$, we have linear regression with input matrix \begin{align} \mathbf{\Phi} = \mathbf{X}_{n \times p} = (\mathbf{x}_1, \ldots, \mathbf{x}_n)^\intercal. \nonumber \end{align}

  • When $p=1$ and $\phi_j(x) = x^j$, we have polynomial fitting.

Note that the matrix $\mathbf{X}$ should be of size $n \times (p+1)$ because of the regression constant (intercept). To simplify notation, we assume $x_{i1} = 1$ for all $i \in N$ and hence ignore the notational inconvenience of dimension $p+1$.
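As a small illustration (a sketch with arbitrary inputs), the design matrix for a degree-3 polynomial basis with $p=1$ can be built directly with NumPy; row $i$ holds $(1, x_i, x_i^2, x_i^3)$, so $\mathbf{\Phi}$ is $n \times m$ with $m=4$.

# build the n x m polynomial design matrix Phi for a toy input vector
x = np.array([0.0, 0.5, 1.0, 1.5, 2.0])        # n = 5 inputs
Phi = np.vstack([x ** j for j in range(4)]).T  # columns: 1, x, x^2, x^3
print(Phi.shape)                               # (5, 4)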

In a frequentist setting,

  • $\mathbf{w}$ is considered to be a fixed parameter, whose value is determined by some form of "estimator", and
  • error bars on this estimate are obtained by considering the distribution of possible observed data sets $\mathcal{D}$.

In a Bayesian setting,

  • We assume a prior probability distribution $p(\mathbf{w})$ before observing the data.
  • The effect of the observed data $\mathcal{D}$ is expressed through $p(\mathcal{D}|\mathbf{w})$, i.e., the likelihood function.
  • Bayes' theorem \begin{align*} p(\mathbf{w}|\mathcal{D}) &= \frac{p(\mathcal{D}|\mathbf{w}) p(\mathbf{w})}{p(\mathcal{D})},\ \text{i.e., posterior} \propto \text{likelihood $\times$ prior} \end{align*}

evaluates the uncertainty in $\mathbf{w}$ after $\mathcal{D}$ is observed, where

  • $p(\mathbf{w}|\mathcal{D})$ is the posterior probability after $\mathcal{D}$ is observed,
  • $p(\mathcal{D})$ is the evidence, i.e., the marginal probability of the data (a short code sketch of this update for our linear model follows).
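Concretely, for the linear model above the posterior can be written down in a few lines. The sketch below assumes a zero-mean Gaussian prior $p(\mathbf{w}) = \mathcal{N}(\mathbf{0}, \alpha^{-1}\mathbf{I})$ and a known noise precision $\beta = 1/\sigma^2$ (both illustrative choices, not fixed by the text above); with a Gaussian likelihood the posterior is Gaussian with covariance $\mathbf{S}_N = (\alpha\mathbf{I} + \beta\mathbf{\Phi}^\intercal\mathbf{\Phi})^{-1}$ and mean $\mathbf{m}_N = \beta \mathbf{S}_N \mathbf{\Phi}^\intercal \mathbf{y}$.

def posterior(Phi, y, alpha=2.0, beta=25.0):
    # conjugate Gaussian prior N(0, alpha^-1 I) and Gaussian likelihood with precision beta;
    # the resulting posterior over w is Gaussian with the mean and covariance returned below
    S_N = np.linalg.inv(alpha * np.eye(Phi.shape[1]) + beta * Phi.T @ Phi)
    m_N = beta * S_N @ Phi.T @ y
    return m_N, S_N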

Least Squares

We minimize residual sum of squares (RSS) \begin{align*} \text{RSS}(\mathbf{w}) = \sum_{i=1}^{n} (y_i - f(\mathbf{x}_i, \mathbf{w}))^2 = (\mathbf{y} - \mathbf{\Phi} \mathbf{w})^\intercal(\mathbf{y} - \mathbf{\Phi}\mathbf{w}) \end{align*}

The first order condition gives \begin{align*} \mathbf{\Phi}^\intercal (\mathbf{y} - \mathbf{\Phi}\mathbf{w}) = 0 \end{align*}

and solving for $\mathbf{w}$ leads to the normal equation \begin{align*} \mathbf{\hat{w}} = (\mathbf{\Phi}^\intercal\mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal \mathbf{y} = \mathbf{\Phi}^\dagger \mathbf{y} \end{align*}

where $\mathbf{\Phi}^\dagger$ is the Moore-Penrose pseudo-inverse of the matrix $\mathbf{\Phi}$.
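As a quick sketch (not part of the derivation), the normal-equation solution can be computed in NumPy either through the pseudo-inverse or with np.linalg.lstsq, which solves the same least-squares problem via an SVD and is numerically preferable.

def fit_least_squares(Phi, y):
    # Moore-Penrose pseudo-inverse solution of the normal equation
    w_pinv = np.linalg.pinv(Phi) @ y
    # lstsq solves the same problem more stably; the two should agree
    w_lstsq, *_ = np.linalg.lstsq(Phi, y, rcond=None)
    assert np.allclose(w_pinv, w_lstsq)
    return w_pinv

For the least-squares estimator we have the following result.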

Theorem

$E(\mathbf{\hat{w}}) = \mathbf{w}$ and $\text{Var}(\mathbf{\hat{w}}) = (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \sigma^2$.

Proof

Note that $y_i$ are uncorrelated with constant variance $\sigma^2$, and $\mathbf{x}_i$ are fixed.

$\mathbf{\hat{w}}$ is an unbiased estimate of $\mathbf{w}$:

\begin{align*} \text{E}(\mathbf{\hat{w}}) & = \text{E}((\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal \mathbf{y}) = (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal \text{E}(\mathbf{y}) = (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal \mathbf{\Phi} \mathbf{w} = \mathbf{w}, \end{align*}

since $\text{E}(\mathbf{y}) = \mathbf{\Phi}\mathbf{w}$ (the noise has zero mean).

Because

\begin{align*} \mathbf{\hat{w}} - \text{E}(\mathbf{\hat{w}}) & = (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal \mathbf{y} - (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal \mathbf{\Phi} \mathbf{w} = (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal (\mathbf{y} - \mathbf{\Phi} \mathbf{w}) \\ &\equiv (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal \epsilon \\ \text{E}(\epsilon \epsilon^\intercal) &= \text{Var}(\epsilon) = \sigma^2 \mathbf{I}_{n}, \end{align*}

the variance is

\begin{align*} \text{Var}(\mathbf{\hat{w}}) &= \text{E}((\mathbf{\hat{w}} - \text{E}(\mathbf{\hat{w}}))(\mathbf{\hat{w}} - \text{E}(\mathbf{\hat{w}}))^\intercal) = (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal \text{E}(\epsilon \epsilon^\intercal) \mathbf{\Phi} (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \\ &= (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \mathbf{\Phi}^\intercal \mathbf{\Phi} (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \sigma^2 = (\mathbf{\Phi}^\intercal \mathbf{\Phi})^{-1} \sigma^2 \end{align*}


Theorem

The estimated variance \begin{align*} \hat{\sigma}^2 = \frac{1}{n-p} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2, \end{align*} where $\hat{y}_i = \mathbf{x}_i^\intercal \mathbf{\hat{w}}$ (i.e., $\mathbf{\hat{y}} = \mathbf{X} \mathbf{\hat{w}}$), is an unbiased estimate of the variance $\sigma^2$.

Proof

We need to show that $\textrm{E}(\hat{\sigma}^2) = \sigma^2$. The vector of residuals is \begin{align*} \hat{\epsilon} = \mathbf{y} - \mathbf{\hat{y}} = \mathbf{y} - \mathbf{X} \mathbf{\hat{w}} = (\mathbf{I} - \mathbf{X} (\mathbf{X}^\intercal \mathbf{X})^{-1} \mathbf{X}^\intercal) \mathbf{y} \equiv Q \mathbf{y} = Q (\mathbf{X} \mathbf{w} + \epsilon) = Q \epsilon, \end{align*} where the last equality uses $Q\mathbf{X} = 0$.

Note that $\textrm{tr}(Q) = \textrm{tr}(I) - \textrm{tr}(\mathbf{X} (\mathbf{X}^\intercal \mathbf{X})^{-1} \mathbf{X}^\intercal) = n - \textrm{tr}((\mathbf{X}^\intercal \mathbf{X})^{-1} \mathbf{X}^\intercal \mathbf{X}) =n - \textrm{tr}(I_{p \times p}) = n - p$. Now, we are ready to show the estimator is unbiased.

We have $(n-p)\textrm{E}(\hat{\sigma}^2) = \textrm{E}(\hat{\epsilon}^\intercal \hat{\epsilon}) = \textrm{E}(\epsilon^\intercal Q \epsilon)$. Since $\epsilon^\intercal Q \epsilon$ is a $1 \times 1$ matrix, it equals its own trace. \begin{align*} \textrm{E}(\epsilon^\intercal Q \epsilon) &= \textrm{E}(\textrm{tr}(\epsilon^\intercal Q \epsilon)) = \textrm{E}(\textrm{tr}(Q \epsilon \epsilon^\intercal)) = \textrm{tr}(Q\textrm{E}(\epsilon \epsilon^\intercal))\\ & = \textrm{tr}(Q\textrm{E}(\sigma^2 \mathbf{I})) = \sigma^2 \textrm{tr}(Q) = \sigma^2 (n-p) \end{align*}

We can show further that \begin{align*} \frac{(n-p)\hat{\sigma}^2}{\sigma^2} &\sim \chi^2_{n-p}, \end{align*} which also implies $\textrm{E}(\hat{\sigma}^2) = \sigma^2$, since a $\chi^2_{n-p}$ random variable has mean $n-p$.

It can be checked that $Q^\intercal=Q=Q^2$. Since $Q^2 - Q=0$, the minimal polynomial of $Q$ divides the polynomial $z^2 - z$. So, the eigenvalues of $Q$ are either 0 or 1. Since $\textrm{tr}(Q) = n - p$ is also the sum of the eigenvalues multiplied by their multiplicities, we necessarily have that 1 is an eigenvalue with multiplicity $n - p$ and 0 is an eigenvalue with multiplicity $p$.

Since $Q$ is real and symmetric, there exists an orthogonal matrix $V$ with \begin{align*} V^\intercal QV = \Delta = \textrm{diag}(\underbrace{1, \ldots, 1}_{n-p \textrm{ times}}, \underbrace{0, \ldots, 0}_{p \textrm{ times}}) \end{align*}

Since $\epsilon \sim N(0, \sigma^2 \mathbf{I})$, we have $\hat{\epsilon} \sim N(0, \sigma^2 Q)$. Let $K \equiv V^\intercal \hat{\epsilon}$. Then $K \sim N(0, \sigma^2 \Delta)$ with $K_{n-p+1}=\ldots=K_n=0$ and

\begin{align*} \frac{\|K\|^2}{\sigma^2} &= \frac{\|K^{\star}\|^2}{\sigma^2} \sim \chi^2_{n-p} \end{align*}

with $K^{\star} = (K_1, \ldots, K_{n-p})^\intercal$. Since $V$ is orthogonal,

\begin{align*} \|\hat{\epsilon}\|^2 &= \|K\|^2=\|K^{\star}\|^2 \end{align*}

Thus, \begin{align*} \frac{(n-p)\hat{\sigma}^2}{\sigma^2} &\sim \chi^2_{n-p} \end{align*}
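As a sanity check (a simulation sketch with arbitrary choices of $n$, $p$, and $\sigma$, not part of the derivation), we can draw many data sets from a fixed design, refit each time, and verify that $(n-p)\hat{\sigma}^2/\sigma^2$ has mean $n-p$ and variance $2(n-p)$, as a $\chi^2_{n-p}$ variable should.

# Monte Carlo check that sigma_hat^2 is unbiased and (n-p) sigma_hat^2 / sigma^2 ~ chi^2_{n-p}
rng = np.random.default_rng(0)
n, p, sigma = 50, 4, 0.3
X_sim = np.hstack([np.ones((n, 1)), rng.normal(size=(n, p - 1))])  # fixed design with intercept
w_true = rng.normal(size=p)
chi2_samples = []
for _ in range(5000):
    y_sim = X_sim @ w_true + rng.normal(scale=sigma, size=n)
    w_hat = np.linalg.lstsq(X_sim, y_sim, rcond=None)[0]
    sigma2_hat = np.sum((y_sim - X_sim @ w_hat) ** 2) / (n - p)
    chi2_samples.append((n - p) * sigma2_hat / sigma ** 2)
print(np.mean(chi2_samples), n - p)       # mean of chi^2_{n-p} is n-p, so E(sigma_hat^2) = sigma^2
print(np.var(chi2_samples), 2 * (n - p))  # variance of chi^2_{n-p} is 2(n-p)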


We use least squares to fit 10 input data points with different basis functions.
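The cells below rely on basis-function classes (Polynomial, Gaussian, Sigmoidal, each exposing a design-matrix method dm) and on a LeastSquares estimator with fit, predict, and predict_dist, plus an optional regularization coefficient that is used later for ridge regression. These objects are defined elsewhere in the notebook; the following is only a minimal sketch of how they might be implemented, included so that the cells in this section are reproducible on their own.

class Polynomial:
    def __init__(self, degree):
        self.degree = degree
    def dm(self, x):
        # design matrix with columns 1, x, x^2, ..., x^degree
        return np.vstack([x ** j for j in range(self.degree + 1)]).T

class Gaussian:
    def __init__(self, mean, std):
        self.mean, self.std = np.asarray(mean), std
    def dm(self, x):
        # Gaussian bumps centered at `mean`, plus a bias column
        basis = np.exp(-0.5 * ((x[:, None] - self.mean[None, :]) / self.std) ** 2)
        return np.hstack([np.ones((len(x), 1)), basis])

class Sigmoidal:
    def __init__(self, mean, slope):
        self.mean, self.slope = np.asarray(mean), slope
    def dm(self, x):
        # logistic sigmoids centered at `mean`, plus a bias column
        basis = 1 / (1 + np.exp(-self.slope * (x[:, None] - self.mean[None, :])))
        return np.hstack([np.ones((len(x), 1)), basis])

class LeastSquares:
    def __init__(self, alpha=0.):
        self.alpha = alpha  # regularization coefficient (0 gives plain least squares)
    def fit(self, phi, t):
        m = phi.shape[1]
        # (regularized) normal equation; pinv keeps the sketch robust to ill-conditioning
        self.w = np.linalg.pinv(phi.T @ phi + self.alpha * np.eye(m)) @ phi.T @ t
        self.var = np.mean((phi @ self.w - t) ** 2)  # maximum-likelihood noise variance
        return self
    def predict(self, phi):
        return phi @ self.w
    def predict_dist(self, phi):
        # point prediction plus a constant predictive standard deviation
        return phi @ self.w, np.sqrt(self.var) * np.ones(len(phi))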

x_train, t_train = create_data(sinusoidal, 10, 0.25)

x_test = np.linspace(0, 1, 100)
t_test = sinusoidal(x_test)

basis_funcs = {'Polynomial': [8], 'Gaussian': [np.linspace(0, 1, 8), 0.1], 'Sigmoidal': [np.linspace(0, 1, 8), 10]}

fig = plt.figure(figsize=(18, 5))

for i, (key, value) in enumerate(basis_funcs.items()):
    
    phi_train = globals()[key](*value).dm(x_train)
    phi_test  = globals()[key](*value).dm(x_test)
    
    t, t_std = LeastSquares().fit(phi_train, t_train).predict_dist(phi_test)
    
    plt.subplot(1, 3, i+1)
    plt.scatter(x_train, t_train, facecolor="none", edgecolor="b", s=50, label="training data")
    plt.plot(x_test, t_test, label="$\sin(2\pi x)$")
    plt.plot(x_test, t, label="prediction")
    plt.fill_between(x_test, t - t_std, t + t_std, color="orange", alpha=0.5, label="std.")
    plt.title("{}".format(key))
    plt.legend()
    
plt.show()

The case with multiple outputs (so far we have only considered a scalar target $y$ for each observation) is a natural generalization.

Geometrical Interpretation

The least-squares solution is obtained by finding the orthogonal projection of the data vector $\mathbf{y}$ onto the subspace spanned by the basis $\phi_j(\mathbf{x})$.

The regression can also be performed by successive orthogonalization. The algorithm is known as the Gram-Schmidt orthogonalization procedure, which is equivalent to the QR decomposition of $\mathbf{X}$, as sketched below. If the inputs are orthogonal, the coefficient of each feature can be estimated independently of the others.
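A minimal sketch of the QR route in NumPy (np.linalg.qr performs the orthogonalization that Gram-Schmidt describes): factor $\mathbf{X} = QR$ with orthonormal $Q$ and upper-triangular $R$, then solve $R\mathbf{w} = Q^\intercal\mathbf{y}$.

def fit_qr(X, y):
    # equivalent to (X^T X)^{-1} X^T y, but numerically better conditioned
    Q, R = np.linalg.qr(X)              # Q has orthonormal columns (Gram-Schmidt)
    return np.linalg.solve(R, Q.T @ y)  # back-substitution on the triangular system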

Stochastic Gradient Descent

  • When the training data set is very large or received in a stream, a direct solution using the normal equations of the least squares may not be possible.

  • An alternative approach is the *stochastic gradient descent* algorithm.

  • The total error function

\begin{align} E_{\mathcal{D}}(\mathbf{w}) & = \frac{1}{2} \sum_{i=1}^{n} (y_i - \mathbf{w}^\intercal \phi(x_{i}))^2 \equiv \sum_{i=1}^{n} E_{i}(\mathbf{w}) \nonumber \end{align}
  • In general, the stochastic gradient descent algorithm is applying

    \begin{align} \mathbf{w}^{\tau + 1} = \mathbf{w}^\tau - \eta \bigtriangledown_{\mathbf{w}} E_i \nonumber \end{align}

    where ${\tau}$ is the iteration number and $\eta$ is a learning rate parameter.

  • When the error function is the sum-of-squares function (the order of evaluation does not change the result), then the algorithm is

\begin{align} \mathbf{w}^{\tau + 1} = \mathbf{w}^\tau + \eta \left(y_i - \mathbf{w}^{(\tau) \intercal}\phi_i\right)\phi_i \nonumber \end{align}

Data is generated by a trigonometric function

\begin{align} t = w_0 + w_1 \sin(x) + w_2 \cos(x) \nonumber \end{align}

There are two reasons why we are not satisfied with the least squares estimates:

  • Interpretation:

    Instead of having a large number of predictors, we would like to determine a smaller subset that exhibits the strongest effects.

  • Prediction accuracy:

    The least squares estimates often have low bias but large variance (see Bias-Variance Decomposition). Prediction accuracy can sometimes be improved by shrinking or setting some coefficients to zero. The shrinking methods impose some constraints on the coefficients. The constraints may add bias to the model, but the resulting model will have lower variance.

def trigonometric(x, w=[1.85, 0.57, 4.37]): 
    return np.dot(w, [np.ones_like(x), np.sin(x), np.cos(x)])

x_trig_train, t_trig_train = create_data(trigonometric, 20, 1, [0, 4.0*np.pi])

x_trig_test = np.linspace(0, 4.0*np.pi, 100)
t_trig_test = trigonometric(x_trig_test)

# design matrix
phi = np.array([np.ones_like(x_trig_train), np.sin(x_trig_train), np.cos(x_trig_train)]).transpose()

# we set initial w = 0
w = np.zeros(phi.shape[1])

eta = 0.25
# since we have 20 samples, we use gradient from each sample in order
for i in range(20):
    w = w + eta * (t_trig_train[i] - np.dot(w.T, phi[i])) * phi[i]

t_predicted = trigonometric(x_trig_test, w)

plt.scatter(x_trig_train, t_trig_train, facecolor="none", edgecolor="b", s=50, label="training data")
plt.plot(x_trig_test, t_trig_test, c="g", label="trigonometric")
plt.plot(x_trig_test,t_predicted, c='r', label="fitting")
plt.legend(bbox_to_anchor=(1.05, 0.27), loc=2, borderaxespad=0.5)
plt.show()

Subset Selection

We only consider a subset of the independent variables, and eliminate the rest from the model. Least squares regression is used to estimate the coefficients of the inputs that are retained.

Best subset regression finds, for each subset size $k$, the subset of size $k$ that gives the smallest residual sum of squares $\text{RSS}(\mathbf{w})$.

Forward- and Backward-Stepwise Selection are greedy algorithms to find a subset of variables by producing a nested sequence of models.

  • Forward stepwise selection starts with the intercept, and then sequentially adds into the model the predictor that most improves the fit in terms of $\text{RSS}(\mathbf{w})$.

  • Backward-stepwise selection starts with the full model, and sequentially deletes the predictor that has the least impact on the fit. The candidate for dropping is the variable with the smallest Z-score.

Forward-Stagewise Selection is even more constrained than forward-stepwise regression and proceeds as follows (a minimal sketch appears after the list):

  • Choose $\mathbf{x}_j$ with highest current correlation $c_j = \mathbf{x}_j^\intercal(\mathbf{y} - \mathbf{\hat{y}})$

  • Take a small step $0 < \alpha < |c_j|$ in the direction of selected $\mathbf{x}_j$

  • $\hat{y} \leftarrow \hat{y} + \alpha \cdot \text{sign}(c_j) \cdot \mathbf{x}_j$
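The sketch below implements these three steps directly; it assumes standardized columns of $\mathbf{X}$ and a centered $\mathbf{y}$, and the step size and iteration budget are arbitrary illustrative choices.

def forward_stagewise(X, y, alpha=0.01, n_steps=1000):
    n, p = X.shape
    w = np.zeros(p)
    y_hat = np.zeros(n)
    for _ in range(n_steps):
        c = X.T @ (y - y_hat)                     # current correlations with the residual
        j = np.argmax(np.abs(c))                  # predictor most correlated with the residual
        if np.abs(c[j]) < alpha:                  # stop once no predictor is useful
            break
        w[j] += alpha * np.sign(c[j])             # tiny move of a single coefficient
        y_hat += alpha * np.sign(c[j]) * X[:, j]  # update the fit
    return w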

By retaining a subset of the predictors and discarding the rest, subset selection produces a model that is interpretable and possibly has lower prediction error than the full model. However, because it is a discrete process in which variables are either retained or discarded, it often exhibits high variance and sometimes does not reduce the prediction error of the full model.

Shrinkage Methods

Shrinkage methods shrink the regression coefficients by imposing a penalty on their size. Because shrinkage is a continuous process, it does not suffer as much from high variability as subset selection.

\begin{align*} E_{\mathcal{D}}(\mathbf{w}) + \lambda E_{\mathbf{w}}(\mathbf{w}) & = \frac{1}{2} \sum_{i=1}^{n} (y_i - \mathbf{w}^\intercal \phi(x_{i}))^2 + \frac{\lambda}{2} \sum_{j=0}^{m-1} \lvert w_j \rvert^q \\ & = \frac{1}{2} \left( \mathbf{y}^\intercal \mathbf{y} - 2 (\mathbf{\Phi} \mathbf{w})^\intercal \mathbf{y} + (\mathbf{\Phi} \mathbf{w})^\intercal \mathbf{\Phi} \mathbf{w} \right) + \frac{\lambda}{2} \lVert \mathbf{w} \rVert_q^q \end{align*}

It is equivalent to minimizing the unregularized sum-of-squares error subject to the constraint

\begin{align*} \sum_{j=0}^{m-1} \lvert w_j \rvert^q \le \eta. \end{align*}

Examples:

  • $q=0$ is variable subset selection,

  • $q=1$ is known as the lasso (Least Absolute Shrinkage and Selection Operator), and

  • $q=2$ corresponds to ridge regression. We have

\begin{align*} E_{\mathcal{D}}(\mathbf{w}) + \lambda E_{\mathbf{w}}(\mathbf{w}) & \sim (\mathbf{\Phi} \mathbf{w})^\intercal \mathbf{\Phi} \mathbf{w} - 2 (\mathbf{\Phi} \mathbf{w})^\intercal \mathbf{y} + \mathbf{y}^\intercal \mathbf{y} + \lambda \mathbf{w}^\intercal \mathbf{w} \\ \frac{d (E_{\mathcal{D}}(\mathbf{w}) + \lambda E_{\mathbf{w}}(\mathbf{w}) )}{d \mathbf{w}} & = 2\mathbf{\Phi}^\intercal \mathbf{\Phi} \mathbf{w} - 2\mathbf{\Phi}^\intercal \mathbf{y} + 2\lambda \mathbf{w} = 0 \\ \mathbf{\Phi}^\intercal \mathbf{y} &= \left( \mathbf{\Phi}^\intercal \mathbf{\Phi} + \lambda \mathbf{I} \right) \mathbf{w} \\ \mathbf{\hat{w}}^{\text{ridge}} &= \left(\mathbf{\Phi}^\intercal \mathbf{\Phi} + \lambda \mathbf{I} \right)^{-1} \mathbf{\Phi}^\intercal \mathbf{y} \end{align*}

One might consider estimating $q$ from the data; however, it is generally not worth the extra variance incurred. We use ridge regression with three different regularization values for a polynomial fit.

x_train, t_train = create_data(sinusoidal, 10, 0.25)

x_test = np.linspace(0, 1, 100)
t_test = sinusoidal(x_test)

# set degree = 9 because it's the most complicated polynomial model for 10 data points
phi_train = Polynomial(9).dm(x_train)
phi_test  = Polynomial(9).dm(x_test)

fig = plt.figure(figsize=(15, 4))
for i, alpha in enumerate([0, 1e-3, 1]):
    
    # alpha is the regularization coefficient
    t = LeastSquares(alpha).fit(phi_train, t_train).predict(phi_test)
   
    plt.subplot(1, 3, i+1)
    plt.scatter(x_train, t_train, facecolor="none", edgecolor="b", s=50, label="training data")
    plt.plot(x_test, t_test, c="g", label="$\sin(2\pi x)$")
    plt.plot(x_test, t, c="r", label="fitting")
    plt.ylim(-1.5, 1.5)
    plt.legend(loc=1)
    plt.annotate(r"n={}, $\lambda$={}".format(9, alpha), xy=(0.05, -1.2))
plt.show()

Singular Value Decomposition (SVD)

The singular value decomposition (SVD) gives us some insight into the nature of ridge regression. The SVD of $\mathbf{X}_{n \times p} = (\mathbf{x}_1, \ldots, \mathbf{x}_n)^\intercal$ has the form $\mathbf{X} = U D V^\intercal$, where

  • $U_{n \times p}$ has orthonormal columns that span the column space of $\mathbf{X}$
  • $V_{p \times p}$ is orthogonal and its columns span the row space of $\mathbf{X}$
  • $D_{p \times p}$ is diagonal with $d_1 \ge d_2 \ge \cdots \ge d_p \ge 0$.

We have

\begin{align*} \mathbf{X} \mathbf{w}^{\text{ls}} = UU^\intercal \mathbf{y} = \sum_{j=1}^{p} \mathbf{u}_j (\mathbf{u}^\intercal_j \mathbf{y}) \end{align*}

and

\begin{align*} \mathbf{X} \mathbf{w}^{\text{ridge}} = \sum_{j=1}^{p} \mathbf{u}_j \left(\frac{d^2_j}{d^2_j + \lambda} (\mathbf{u}^\intercal_j \mathbf{y}) \right) \end{align*}

where

  • $\mathbf{u}_j$ is a normalized principal component,
  • Ridge shrinks more with smaller $d^2_j$ (smaller variance, see the figure below),
  • Ridge improves interpretability by putting low weights on small variance components.

Technical Details

Note that $\mathbf{X}^\intercal \mathbf{X} = VDU^\intercal UDV^\intercal = VD^2V^\intercal$. Thus

\begin{align*} \mathbf{X} \mathbf{w}^{\text{ls}} &= UDV^\intercal (VD^2V^\intercal)^{-1} VDU^\intercal \mathbf{y} = UDV^\intercal (V^{-\intercal}D^{-2}V^{-1}) VDU^\intercal \mathbf{y} \\ &= UU^\intercal \mathbf{y} = \sum_{j=1}^{p} \mathbf{u}_j (\mathbf{u}^\intercal_j \mathbf{y}) \\ \mathbf{w}^{\text{ridge}} &= (\mathbf{X}^\intercal \mathbf{X} + \lambda \mathbf{I})^{-1} \mathbf{X}^\intercal \mathbf{y} = (VD^2V^\intercal + \lambda VV^\intercal)^{-1}VDU^\intercal\mathbf{y} \\ &= (V(D^2+\lambda \mathbf{I})V^\intercal)^{-1}VDU^\intercal\mathbf{y} \\ &= V^{-\intercal}(D^2+\lambda \mathbf{I})^{-1}DU^\intercal\mathbf{y} \\ \mathbf{X} \mathbf{w}^{\text{ridge}} &=UD(D^2+\lambda \mathbf{I})^{-1}DU^\intercal\mathbf{y} \end{align*}

The matrix $D(D^2+\lambda \mathbf{I})^{-1}D$ is diagonal with elements given by

\begin{align*} \frac{d^2_j}{d^2_j + \lambda} \end{align*}

Thus,

\begin{align*} \mathbf{X} \mathbf{w}^{\text{ridge}} &= \sum_{j=1}^{p} \mathbf{u}_j \left(\frac{d^2_j}{d^2_j + \lambda} (\mathbf{u}^\intercal_j \mathbf{y}) \right) \end{align*}
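A quick numerical check of this identity (a sketch with arbitrary random data and an arbitrary $\lambda$): compute the ridge fit once through the shrunken SVD components and once through the closed-form solution, and confirm they agree.

# verify X w_ridge = sum_j u_j * d_j^2 / (d_j^2 + lambda) * (u_j^T y) on random data
rng = np.random.default_rng(1)
n, p, lam = 30, 5, 2.0
X_svd = rng.normal(size=(n, p))
y_svd = rng.normal(size=n)
U, d, Vt = np.linalg.svd(X_svd, full_matrices=False)           # thin SVD: X = U diag(d) V^T
ridge_fit_svd = U @ (d ** 2 / (d ** 2 + lam) * (U.T @ y_svd))  # shrunken components
w_ridge = np.linalg.solve(X_svd.T @ X_svd + lam * np.eye(p), X_svd.T @ y_svd)
print(np.allclose(ridge_fit_svd, X_svd @ w_ridge))             # True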


Bias-Variance Decomposition

We seek a function $f(\mathbf{x})$ for predicting $y$ given the input $\mathbf{x}$. Decision theory minimizes a loss function that penalizes errors in prediction

\begin{align*} \mathbb{E} (L) & = \int \int \left( f(\mathbf{x}) - y \right)^2 p(\mathbf{x}, y) d\mathbf{x} dy \end{align*}

By conditioning on $\mathbf{x}$, we have

\begin{align*} \mathbb{E} (L) & = \mathbb{E} _{\mathbf{x}} \mathbb{E} _{y|\mathbf{x}} \left[ \left( f(\mathbf{x}) - y \right)^2 | \mathbf{x} \right] \end{align*}

It suffices to minimize the expected loss pointwise

\begin{align*} \text{argmin}_{\alpha} \mathbb{E} _{y|\mathbf{x}} \left[ \left(y - \alpha \right)^2 | \mathbf{x} = \mathbf{x}_0 \right] \end{align*}

The solution is \begin{align*} h(\mathbf{x}_0) = \mathbb{E} (y \lvert \mathbf{x} = \mathbf{x}_0 ) \end{align*}

where $h(\mathbf{x})$ is the conditional expectation, also known as the regression function, and

\begin{align} h(\mathbf{x}) = \mathbb{E} (y \lvert \mathbf{x} ) = \int y p(y \lvert \mathbf{x}) dy \nonumber \end{align}

It is worth mentioning that the above squared loss function $\mathbb{E} (L)$ arises from decision theory. It differs from the sum-of-squares error function that arose in the maximum likelihood estimation of model parameters, because we might use more sophisticated techniques than least squares, for example regularization or a fully Bayesian approach, to determine the conditional distribution $p(y \lvert \mathbf{x})$.

Bias-Variance Decomposition

For a particular data set $\mathcal D$, $f(\mathbf{x})$ takes the form $f(\mathbf{x}; \mathcal D)$.

\begin{align*} \mathbb{E} (L) = \underbrace{\mathbb{E}_{\mathcal D}\left[ \left( f(\mathbf{x}; \mathcal D) - \mathbb{E}_{\mathcal D}\left[ f(\mathbf{x}; \mathcal D)\right] \right)^2 \right]}_{\text{variance}} + \underbrace{\left( \mathbb{E}_{\mathcal D}\left[f(\mathbf{x}; \mathcal D)\right] - h(\mathbf{x}) \right)^2}_{\text{bias$^2$}} + \underbrace{\int \text{Var}\left[y|\mathbf{x}\right] p(\mathbf{x}) d\mathbf{x}}_{\text{noise}} \end{align*}

or

\begin{equation*} \text{expected loss = variance (of prediction) + bias$^2$ (of estimate) + noise (of data)} \label{bias_var_decomp} \end{equation*}
Proof
\begin{align*} \mathbb{E} (L) & = \int \int \left( f(\mathbf{x}) - y\right)^2 p(\mathbf{x}, y) d\mathbf{x} dy \\ & = \int \int \left( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] + \mathbb{E} \left[ y | \mathbf{x}\right] - y \right)^2 p(\mathbf{x}, y) d\mathbf{x} dy \\ & = \int \int \left( ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] )^2 + \underbrace{2 ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] )(\mathbb{E} \left[ y | \mathbf{x}\right] -y)}_{\text{vanishes}} + (\mathbb{E} \left[ y | \mathbf{x}\right] - y )^2\right) p(\mathbf{x}, y) d\mathbf{x} dy \\ & = \int \left( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] \right)^2 p(\mathbf{x}) d\mathbf{x} + \int \left( \mathbb{E} \left[ y | \mathbf{x}\right] -y \right)^2 p(\mathbf{x},y) d\mathbf{x} dy \\ & = \int \left( f(\mathbf{x}) - h(\mathbf{x}) \right)^2 p(\mathbf{x}) d\mathbf{x} + \underbrace{\int \text{Var}\left[y|\mathbf{x}\right] p(\mathbf{x}) d\mathbf{x}}_{\text{noise}} \end{align*}

For any given data set $\mathcal{D}$, the learning algorithm produces a prediction function $f(\mathbf{x}; \mathcal{D})$.

The cross term vanishes because

\begin{align*} \int ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] ) \mathbb{E} \left[ y | \mathbf{x}\right] p(\mathbf{x},y) d\mathbf{x} dy &= \int ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] ) \mathbb{E} \left[ y | \mathbf{x}\right] p(\mathbf{x}) p(y|\mathbf{x})d\mathbf{x} dy \\ & = \int ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] ) \mathbb{E} \left[ y | \mathbf{x}\right] p(\mathbf{x}) (\int p(y|\mathbf{x}) dy)d\mathbf{x} \\ & =\int ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] ) \mathbb{E} \left[ y | \mathbf{x}\right] p(\mathbf{x}) d\mathbf{x} \\ \int ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] ) y p(\mathbf{x},y) d\mathbf{x} dy &= \int ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] ) y p(\mathbf{x}) p(y|\mathbf{x}) d\mathbf{x} dy \\ & = \int ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] ) p(\mathbf{x}) (\int y p(y|\mathbf{x}) dy ) d\mathbf{x} \\ & = \int ( f(\mathbf{x}) - \mathbb{E} \left[ y | \mathbf{x}\right] ) \mathbb{E} \left[ y | \mathbf{x}\right] p(\mathbf{x}) d\mathbf{x} \end{align*}

Now, we consider \begin{align*} \int \left( f(\mathbf{x}) - h(\mathbf{x}) \right)^2 p(\mathbf{x}) d\mathbf{x} \end{align*}

We have \begin{align*} \left( f(\mathbf{x}; \mathcal D) - h(\mathbf{x}) \right)^2 =& \left( f(\mathbf{x}; \mathcal D) - \mathbb{E}_{\mathcal D}\left[ f(\mathbf{x}; \mathcal D)\right] + \mathbb{E}_{\mathcal D}\left[f(\mathbf{x}; \mathcal D)\right] - h(\mathbf{x}) \right)^2 \\ &= \left( f(\mathbf{x}; \mathcal D) - \mathbb{E}_{\mathcal D}\left[ f(\mathbf{x}; \mathcal D)\right] \right)^2+ \left(\mathbb{E}_{\mathcal D} \left[f(\mathbf{x}; \mathcal D)\right] - h(\mathbf{x}) \right)^2 \\ &\hspace{1cm} + \underbrace{2 \left( f(\mathbf{x}; \mathcal D) - \mathbb{E}_{\mathcal D}\left[f(\mathbf{x}; \mathcal D)\right] \right) \left( \mathbb{E}_{\mathcal D}\left[f(\mathbf{x}; \mathcal D)\right] - h(\mathbf{x}) \right)}_{\text{vanishes after taking $\mathbb{E}_{\mathcal D}$}} \end{align*}

Taking the expectation over $\mathcal D$, we get the mean squared error (MSE)

\begin{align*} \mathbb{E}_{\mathcal D}\left[\left( f(\mathbf{x}; \mathcal D) - h(\mathbf{x}) \right)^2 \right] = \underbrace{\mathbb{E}_{\mathcal D}\left[ \left( f(\mathbf{x}; \mathcal D) - \mathbb{E}_{\mathcal D}\left[ f(\mathbf{x}; \mathcal D)\right] \right)^2 \right]}_{\text{variance}} + \underbrace{\left( \mathbb{E}_{\mathcal D}\left[f(\mathbf{x}; \mathcal D)\right] - h(\mathbf{x}) \right)^2}_{\text{bias$^2$}} \end{align*}


The bias-variance decomposition provides insight into model complexity from a frequentist perspective.

  • Noise cannot be reduced because it exists in the true data generation process.
  • The bias-variance trade-off shows that very flexible models have low bias and high variance, while relatively rigid models have high bias and low variance.
  • For example, the least squares estimate of $\mathbf{w}$ has the smallest variance among all linear unbiased estimates (Gauss-Markov theorem). However, biased estimates (e.g., ridge regression) may have smaller variance.

The bias-variance trade-off is demonstrated by $k$-nearest-neighbor regression and linear regression. Consider $y = f(\mathbf{x}) + \epsilon$ where $\mathbb{E}(\epsilon) = 0$ and $\text{Var}(\epsilon) = \sigma^2$.

$k$-nearest-neighbor regression

For $k$-nearest-neighbor regression fit, we have

\begin{align*} \hat{f}(\mathbf{x}) = \text{Avg} (y_i | \mathbf{x}_i \in N_{k} (\mathbf{x})), \end{align*}

where $N_{k} (\mathbf{x})$ is the neighborhood containing the $k$ points in the training dataset closest to $\mathbf{x}$, the expected loss has the form

\begin{align*} \mathbb{E}(L_{\mathbf{x}_0}) &= \mathbb{E} \left[ (y - \hat{f}(\mathbf{x}_0))^2| \mathbf{x} = \mathbf{x}_0 \right]\\ & = \sigma^2 + \left[ f(\mathbf{x}_0) - \frac{1}{k} \sum_{\ell = 1}^{k} f(\mathbf{x}_{(\ell)})\right]^2 + \frac{\sigma^2}{k} \\ & = \text{noise (of data) + bias$^2$ (of estimate) + variance (of prediction) } \end{align*}
linear regression

For a linear regression model, we have

\begin{align*} \hat{f}_p(\mathbf{x}) = \mathbf{x}^\intercal \beta, \end{align*}

where $\beta$ has $p$ components. The expected loss is

\begin{align*} \mathbb{E}(L_{\mathbf{x}_0}) &= \mathbb{E} \left[ (y - \hat{f}(\mathbf{x}_0))^2| \mathbf{x} = \mathbf{x}_0 \right]\\ & = \sigma^2 + \left[ f(\mathbf{x}_0) - \mathbb{E} \hat{f}_p(\mathbf{x}_0) \right]^2 + \|\mathbf{X}(\mathbf{X}^\intercal \mathbf{X})^{-1} \mathbf{x}_0 \|^2\sigma^2 \\ & = \text{noise (of data) + bias$^2$ (of estimate) + variance (of prediction) } \end{align*}

Note the fit

\begin{align*} \hat{f}_p(\mathbf{x}_0) = \mathbf{x}^\intercal_0 \beta = \mathbf{x}^\intercal_0 (\mathbf{X}^\intercal \mathbf{X})^{-1} \mathbf{X}^\intercal y, \end{align*}

and hence

\begin{align*} \text{Var}(\hat{f}_p(\mathbf{x}_0)) = \|\mathbf{X}(\mathbf{X}^\intercal \mathbf{X})^{-1} \mathbf{x}_0 \|^2\sigma^2 \end{align*}
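The two variance terms can be checked with a small simulation (a sketch; the sample size, $k$, $\sigma$, and the test point $x_0$ are arbitrary illustrative choices): draw many training sets on a fixed design, fit both estimators, and compare the empirical variance of the prediction at $\mathbf{x}_0$ with the formulas above.

# Monte Carlo check of the variance terms for k-NN and linear regression at x0
rng = np.random.default_rng(2)
n, k, sigma, x0 = 100, 10, 0.5, 0.7
x = np.linspace(0, 1, n)                    # fixed inputs (fixed design)
X_lin = np.vstack([np.ones(n), x]).T        # linear model with intercept: (1, x)
x0_aug = np.array([1.0, x0])
knn_idx = np.argsort(np.abs(x - x0))[:k]    # indices of the k inputs nearest to x0
knn_preds, lin_preds = [], []
for _ in range(2000):
    y = sinusoidal(x) + rng.normal(scale=sigma, size=n)
    knn_preds.append(y[knn_idx].mean())            # k-NN prediction at x0
    w = np.linalg.lstsq(X_lin, y, rcond=None)[0]
    lin_preds.append(x0_aug @ w)                   # linear prediction at x0
print(np.var(knn_preds), sigma ** 2 / k)           # ~ sigma^2 / k
print(np.var(lin_preds),
      sigma ** 2 * x0_aug @ np.linalg.inv(X_lin.T @ X_lin) @ x0_aug)  # ~ sigma^2 x0^T (X^T X)^{-1} x0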


The bias-variance trade-off plays an important role in the frequentist view. However, its insights on model complexity are of limited practical value, because

  • the bias-variance decomposition is based on averages, $\mathbb{E}_{\mathcal D}$, with respect to an ensemble of data sets, whereas in practice we have only a single observed data set.
  • if we had a large number of independent training sets, we would be better off combining them, which of course would reduce the level of over-fitting for a given model complexity.

We illustrate the bias and variance with a regularization parameter $\ln \lambda \in \{2.6, -0.31, -2.4\}$.

  • We generate 100 datasets in x_train, t_train, and each has 25 sinusoidal data points

  • Graphs in the first row demonstrate the variance

\begin{align*} \mathbb{E}_{\mathcal D}\left[ \left( f(\mathbf{x}; \mathcal D) - \mathbb{E}_{\mathcal D}\left[ f(\mathbf{x}; \mathcal D)\right] \right)^2 \right] \end{align*}

where each curve is represented by a function $f(\mathbf{x}; \mathcal D)$ with given dataset $\mathcal D$. When $\mathcal D$ is given, $f(\mathbf{x}; \mathcal D)$ is a fitting function based on the dataset (for clarity, only 20 of the 100 fits are shown). Their average $\mathbb{E}_{\mathcal D}\left[ f(\mathbf{x}; \mathcal D)\right]$ gives the fitting curve of graphs in the second row.

  • Thus, graphs in the second row demonstrate the bias$^2$
\begin{align*} \left( \mathbb{E}_{\mathcal D}\left[f(\mathbf{x}; \mathcal D)\right] - h(\mathbf{x}) \right)^2 \end{align*}

where $h(\mathbf{x})$ is the optimal prediction, i.e., $\sin (2 \pi x)$ in our case.

The last graph shows variance and bias$^2$ by varying $\lambda$.

funcs = {'Polynomial': [24], 'Gaussian': [np.linspace(0, 1, 24), 0.1], 'Sigmoidal': [np.linspace(0, 1, 24), 10]}

x_test = np.linspace(0, 1, 100)
t_test = sinusoidal(x_test)

def bias_and_var(fname):
    phi_test = globals()[fname](*funcs[fname]).dm(x_test)
    plt.figure(figsize=(12, 6))
    
    for loc, alpha in enumerate([2.6, -0.31, -2.4]):
        plt.subplot(2, 3, loc+1)
        # We have 100 data sets, each having 25 data points
        x_train, t_train = zip(*[create_data(sinusoidal, 25, 0.8) for i in range(100)])
        phi_train = [globals()[fname](*funcs[fname]).dm(x) for x in x_train]
        
        # t[i] is an array contains the prediction values for x_test and 
        # the fitting function is derived based on i-th data set
        t = [LeastSquares(np.exp(alpha)).fit(phi, t).predict(phi_test) for phi, t in zip(phi_train, t_train)]
        # For clarity, only 20 of the 100 fits are shown
        for i in range(20):
            plt.plot(x_test, t[i], c="orange")
            
        plt.annotate(r"$\ln \lambda={}$".format(alpha), xy=(0.05, -1.2))        
        plt.ylim(-1.5, 1.5)

        plt.subplot(2, 3, loc+4)
        plt.plot(x_test, t_test, c="g", label="$\sin(2\pi x)$")
        plt.plot(x_test, np.asarray(t).mean(axis=0), c="r", label="fitting")
        plt.ylim(-1.5, 1.5)
        plt.legend()
    plt.show()
    
    plt.figure(figsize=(8, 6))
    # 
    x_train, t_train = zip(*[create_data(sinusoidal, 25, 0.8) for i in range(100)])
    phi_train = [globals()[fname](*funcs[fname]).dm(x) for x in x_train]
    alpha_range = np.linspace(-3.6, 1.5, 30)
    t = [[LeastSquares(np.exp(alpha)).fit(phi, t).predict(phi_test) for phi, t in zip(phi_train, t_train)]
         for alpha in alpha_range]
    
    bias2 = ((np.asarray(t).mean(axis=1) - t_test) ** 2).mean(axis=1) #(3.46)
    var = np.array(t).var(axis=1).mean(axis=1)                        #(3.47)
    
    plt.ylim(0, (bias2+var).max()+.001)
    plt.plot(alpha_range, bias2, label=r"bias$^2$")
    plt.plot(alpha_range, var, label="var")
    plt.plot(alpha_range, bias2+var, label=r"bias$^2$+var")
    plt.xlabel('$\ln \lambda$')
    
    plt.legend()
    plt.show()
    
bias_and_var('Gaussian')    
# widget_layout = widgets.Layout(width='250px', height='30px')
# interact(bias_and_var,
#          fname = widgets.Dropdown(description='Basis Functions:',
#                                    style = {'description_width': 'initial'},
#                                    options=['Polynomial','Gaussian','Sigmoidal'],
#                                    value='Gaussian',layout=widget_layout));    

Principal Components Regression

In many situations we have a large number of inputs, often very correlated. We create a small number of new variables which are linear combinations of the original inputs. Then those new variables are used in place of the original inputs in the regression.

Principal Components Regression (PCR) forms the derived input columns (principal components) and then regresses $y$ on a subset of those components.

The main purposes of a principal component analysis are the analysis of data to identify patterns and finding patterns to reduce the dimensions of the dataset with minimal loss of information.

In the next example, we start with a 3-dimensional dataset and then reduce it to a 2-dimensional dataset by dropping 1 dimension.

We generate 40 3-dimensional samples randomly drawn from multivariate Gaussian distributions. Assume that the samples stem from two different classes, where one half (i.e., 20) of the samples are labeled $\mathcal{C}_1$ (class 1) and the other half $\mathcal{C}_2$ (class 2), with mean vectors

\begin{align*} \mu_1 = \begin{bmatrix}0\\0\\0\end{bmatrix}, \qquad \mu_2 = \begin{bmatrix}1\\1\\1\end{bmatrix} \end{align*}

and covariance matrices \begin{align*} \Sigma_1 = \Sigma_2 = \begin{bmatrix}1\quad 0\quad 0\\0\quad 1\quad0\\0\quad0\quad1\end{bmatrix} \end{align*}

The resulting samples, class1_sample and class2_sample, are NumPy arrays of shape $3 \times 20$, where each column can be pictured as a 3-dimensional vector

\begin{align*} \pmb x = \begin{pmatrix} x_1 \\ x_2 \\ x_3 \end{pmatrix} \end{align*}

So, each dataset will have the form \begin{align*} \pmb X = \begin{pmatrix} x_{1_1}\; x_{1_2} \; ... \; x_{1_{20}}\\ x_{2_1} \; x_{2_2} \; ... \; x_{2_{20}}\\ x_{3_1} \; x_{3_2} \; ... \; x_{3_{20}}\end{pmatrix} \end{align*}

Because we don't need class labels for the PCA, let us merge the samples for our 2 classes into one $3 \times 40$-dimensional array all_samples. The figure below shows all 40 3-dimensional columns as 40 points.

np.random.seed(234)

mu_vec1 = np.array([0,0,0])
cov_mat1 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20).T
assert class1_sample.shape == (3,20), "The matrix has not the dimensions 3x20"

mu_vec2 = np.array([1,1,1])
cov_mat2 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20).T
assert class2_sample.shape == (3,20), "The matrix has not the dimensions 3x20"

from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
plt.rcParams['legend.fontsize'] = 10   
ax.plot(class1_sample[0,:], class1_sample[1,:], class1_sample[2,:], 'o', 
        markersize=8, color='blue', alpha=0.5, label='class1')
ax.plot(class2_sample[0,:], class2_sample[1,:], class2_sample[2,:], '^', 
        markersize=8, alpha=0.5, color='red', label='class2')

plt.title('Samples for class 1 and class 2')
ax.legend(loc='upper right')

plt.show()
all_samples = np.concatenate((class1_sample, class2_sample), axis=1)

We perform the PCA in the following steps.

Computing Scatter Matrix

The $p$-dimensional mean vector is

mean_x = np.mean(all_samples[0,:])
mean_y = np.mean(all_samples[1,:])
mean_z = np.mean(all_samples[2,:])
mean_vector = np.array([[mean_x],[mean_y],[mean_z]])
print('Mean Vector:\n', mean_vector)
Mean Vector:
 [[0.77257186]
 [0.68324797]
 [0.43460518]]

The scatter matrix is computed by the following equation \begin{align*} S = \sum\limits_{k=1}^n (\pmb x_k - \pmb m)\;(\pmb x_k - \pmb m)^\intercal \end{align*}

where $\pmb m$ is the mean vector (calculated above) \begin{align*} \pmb m = \frac{1}{n} \sum\limits_{k=1}^n \; \pmb x_k \end{align*}

Alternatively, scatter_matrix = np.cov(all_samples) * (40 - 1), i.e., the covariance matrix $\times$ $(n-1)$.

scatter_matrix = np.zeros((3,3))
for i in range(all_samples.shape[1]):
    scatter_matrix += (all_samples[:,i].reshape(3,1) - mean_vector).dot((all_samples[:,i].reshape(3,1) - mean_vector).T)
print('Scatter Matrix:\n', scatter_matrix)
Scatter Matrix:
 [[39.35126181  4.22467775 12.74899364]
 [ 4.22467775 50.9168567  10.18400644]
 [12.74899364 10.18400644 66.86899978]]

Computing Eigenpairs

eig_val_sc, eig_vec_sc = np.linalg.eig(scatter_matrix)
for i in range(len(eig_val_sc)):
    eigvec_sc = eig_vec_sc[:,i].reshape(1,3).T

    print('Eigenvector {}: \n{}'.format(i+1, eigvec_sc))
    print('Eigenvalue {} from scatter matrix: {}'.format(i+1, eig_val_sc[i]))
    print(40 * '-')
Eigenvector 1: 
[[0.33730459]
 [0.39443594]
 [0.85477828]]
Eigenvalue 1 from scatter matrix: 76.59928002932907
----------------------------------------
Eigenvector 2: 
[[ 0.93260098]
 [-0.01618419]
 [-0.36054609]]
Eigenvalue 2 from scatter matrix: 34.34915169429428
----------------------------------------
Eigenvector 3: 
[[ 0.12837844]
 [-0.91878091]
 [ 0.37331034]]
Eigenvalue 3 from scatter matrix: 46.18868656007504
----------------------------------------

Visualizing the eigenvectors: we plot the eigenvectors centered at the sample mean.

from matplotlib.patches import FancyArrowPatch

class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def do_3d_projection(self, renderer=None):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, self.axes.M)
        self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))

        return np.min(zs)        
        
    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))
        FancyArrowPatch.draw(self, renderer)

fig = plt.figure(figsize=(7,7))
ax = fig.add_subplot(111, projection='3d')

ax.plot(all_samples[0,:], all_samples[1,:], all_samples[2,:], 'o', markersize=8, color='green', alpha=0.2)
ax.plot([mean_x], [mean_y], [mean_z], 'o', markersize=10, color='red', alpha=0.5)
for v in eig_vec_sc.T:
    # draw each (unit) eigenvector as an arrow starting at the sample mean
    a = Arrow3D([mean_x, mean_x + v[0]], [mean_y, mean_y + v[1]], [mean_z, mean_z + v[2]],
                mutation_scale=20, lw=3, arrowstyle="-|>", color="r")
    ax.add_artist(a)
ax.set_xlabel('x_values')
ax.set_ylabel('y_values')
ax.set_zlabel('z_values')

plt.title('Eigenvectors')

plt.show()

Choosing $k$ Largest Eigenvalues

We are reducing a 3-dimensional feature space to a 2-dimensional feature subspace by combining the two eigenvectors with the highest eigenvalues to construct our $p \times k$-dimensional eigenvector matrix $\mathbf{W}$.

eig_pairs = [(np.abs(eig_val_sc[i]), eig_vec_sc[:,i]) for i in range(len(eig_val_sc))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort(key=lambda x: x[0], reverse=True)
display('eigen pairs:', eig_pairs)
matrix_w = np.hstack((eig_pairs[0][1].reshape(3,1), eig_pairs[1][1].reshape(3,1)))
print('Matrix W:\n', matrix_w)
'eigen pairs:'
[(76.59928002932907, array([0.33730459, 0.39443594, 0.85477828])),
 (46.18868656007504, array([ 0.12837844, -0.91878091,  0.37331034])),
 (34.34915169429428, array([ 0.93260098, -0.01618419, -0.36054609]))]
Matrix W:
 [[ 0.33730459  0.12837844]
 [ 0.39443594 -0.91878091]
 [ 0.85477828  0.37331034]]

Transforming the Samples

In the last step, we use the $3 \times 2$-dimensional matrix $\mathbf{W}$ that we just computed to transform our samples onto the new subspace via the equation

\begin{align*} y = \mathbf{W}^\intercal \mathbf{x} \end{align*}
transformed = matrix_w.T.dot(all_samples)
plt.plot(transformed[0,0:20], transformed[1,0:20], 'o', markersize=7, color='blue', alpha=0.5, label='class1')
plt.plot(transformed[0,20:40], transformed[1,20:40], '^', markersize=7, color='red', alpha=0.5, label='class2')
plt.xlim([-4,4])
plt.ylim([-4,4])
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.legend()
plt.title('Transformed samples with class labels')
plt.show()

Summary

Now that we have seen how a principal component analysis works, we can use the PCA class from the scikit-learn machine learning library for our convenience in future applications.

from sklearn.decomposition import PCA as sklearnPCA

sklearn_pca = sklearnPCA(n_components=2)
sklearn_transf = sklearn_pca.fit_transform(all_samples.T)

plt.plot(sklearn_transf[0:20,0],sklearn_transf[0:20,1], 'o', 
         markersize=7, color='blue', alpha=0.5, label='class1')
plt.plot(sklearn_transf[20:40,0], sklearn_transf[20:40,1], '^', 
         markersize=7, color='red', alpha=0.5, label='class2')

plt.xlabel('x_values')
plt.ylabel('y_values')
plt.xlim([-4,4])
plt.ylim([-4,4])
plt.legend()
plt.title('Transformed samples with class labels via sklearn.decomposition.PCA')
plt.show()

If we want to mimic the results produced by scikit-learn, we need to change the signs of the eigenvectors produced by scikit-learn

sklearn_transf = sklearn_transf * (-1)

and subtract the mean vectors from the samples $\mathbf{X}$ to center the data at the coordinate system's origin

transformed = matrix_w.T.dot(all_samples - mean_vector)

sklearn_transf = sklearn_transf * (-1)
# sklearn.decomposition.PCA
plt.subplots(1, 2, figsize=(15,5))

plt.subplot(1, 2, 1)
plt.plot(sklearn_transf[0:20,0],sklearn_transf[0:20,1], 'o', 
         markersize=7, color='blue', alpha=0.5, label='class1')
plt.plot(sklearn_transf[20:40,0], sklearn_transf[20:40,1], '^', 
         markersize=7, color='red', alpha=0.5, label='class2')
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.xlim([-4,4])
plt.ylim([-4,4])
plt.legend()
plt.title('Transformed samples via sklearn.decomposition.PCA')

# user-definied PCA
plt.subplot(1, 2, 2)
transformed = matrix_w.T.dot(all_samples - mean_vector)
plt.plot(transformed[0,0:20], transformed[1,0:20], 'o', 
         markersize=7, color='blue', alpha=0.5, label='class1')
plt.plot(transformed[0,20:40], transformed[1,20:40], '^', 
         markersize=7, color='red', alpha=0.5, label='class2')
plt.xlim([-4,4])
plt.ylim([-4,4])
plt.xlabel('x_values')
plt.ylabel('y_values')
plt.legend()
plt.title('Transformed samples step by step approach')
plt.show()

In PCR, we are interested in finding the directions (components) that maximize the variance in our dataset. The implicit assumption is that the response will tend to vary most in the directions of high variance of the inputs. Therefore, we project the entire set of data (without class labels) onto a different subspace via PCA and identify some important predictors.

Partial Least Squares

This technique also constructs a set of linear combinations of the inputs for regression, but unlike principal components regression it uses $\mathbf y$ (in addition to $\mathbf X$) for this construction.

Both PLS and PCR are not scale invariant, so we need to standardize input $\mathbf{x}_j$ to have mean $0$ and variance $1$.

Relation to the optimization problem

PLS seeks directions that have high variance *and* high correlation with the response, in contrast to PCR, which keys only on high variance (Stone and Brooks, 1990; Frank and Friedman, 1993).

Since it uses the response $\mathbf{y}$ to construct its directions, its solution path is a nonlinear function of $\mathbf{y}$.

In particular, the $m$th principal component direction $v_m$ solves:

\begin{equation*} \max_\alpha \text{Var}(\mathbf{X}\alpha)\\ \text{subject to } \|\alpha\| = 1, \alpha^T\mathbf{S} v_l = 0 \text{ for } l = 1,\cdots, m-1, \end{equation*}

where $\mathbf{S}$ is the sample covariance matrix of the $\mathbf{x}_j$. The condition $\alpha^T\mathbf{S} v_l= 0$ ensures that $\mathbf{z}_m = \mathbf{X}\alpha$ is uncorrelated with all the previous linear combinations $\mathbf{z}_l = \mathbf{X} v_l$.

The $m$th PLS direction $\hat\rho_m$ solves:

\begin{equation*} \max_\alpha \text{Corr}^2(\mathbf{y},\mathbf{X}\alpha)\text{Var}(\mathbf{X}\alpha)\\ \text{subject to } \|\alpha\| = 1, \alpha^T\mathbf{S}\hat\rho_l = 0 \text{ for } l=1,\cdots, m-1. \end{equation*}
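For intuition, the first direction of each method can be computed directly (a sketch on standardized inputs and a centered response; it uses the standard facts that the first principal component direction is the top eigenvector of the sample covariance matrix, while the first PLS weight vector is proportional to $\mathbf{X}^\intercal\mathbf{y}$). Printing the variance of each derived component $\mathbf{z} = \mathbf{X}\alpha$ and its correlation with $\mathbf{y}$ makes the difference between the two criteria visible.

def first_directions(X, y):
    # first PCR direction: leading eigenvector of the sample covariance of the inputs
    eigval, eigvec = np.linalg.eigh(np.cov(X, rowvar=False))
    v1 = eigvec[:, -1]
    # first PLS direction: proportional to X^T y (inputs standardized, y centered)
    rho1 = X.T @ y
    rho1 = rho1 / np.linalg.norm(rho1)
    for name, direction in [('PCR', v1), ('PLS', rho1)]:
        z = X @ direction  # derived component
        print(name, 'Var(z) =', z.var(), ' Corr(y, z) =', np.corrcoef(y, z)[0, 1])
    return v1, rho1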

Further analysis reveals that the variance aspect tends to dominate, and so PLS behaves much like ridge regression and PCR.

Partial least squares also tends to shrink the low-variance directions, but can actually inflate some of the higher variance directions. This can make PLS a little unstable, and cause it to have slightly higher prediction error compared to ridge regression.

PLS, PCR and ridge regression tend to behave similarly. Ridge regression may be preferred because it shrinks smoothly, rather than in discrete steps. Lasso falls somewhere between ridge regression and best subset regression, and enjoys some of the properties of each.

Example: Prostate Cancer

The data for this example come from a study by Stamey et al. (1989) that examined the correlation between the level of prostate specific antigen (PSA) and a number of clinical measures, in 97 men who were about to receive a radical prostatectomy. The goal is to predict the log of PSA (lpsa) from a number of measurements.

The data has variables:

  • lpsa: log prostate specific antigen

  • lcavol: log cancer volume

  • lweight: log prostate weight

  • age: age

  • lbph: log of benign prostatic hyperplasia amount

  • svi: seminal vesicle invasion

  • lcp: log of capsular penetration

  • gleason: Gleason score

  • pgg45: percent of Gleason scores 4 or 5

We split data to training and test data. We show the head of the data, the correlation matrix of all variables, and the pairwise scatterplot.

ORANGE, BLUE, PURPLE = '#E69F00', '#56B4E9', '#A020F0'
GRAY1, GRAY4 = '#231F20', '#646369'
target = 'lpsa'
features = ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
df = pd.read_csv(data_path + "prostate_cancer.txt")
display(df.head())
sns.pairplot(df, vars=[target]+features, kind="scatter", height=1.1)

# split data to training/test np.arrays
is_train = df.train == 'T'
X, y = df[features].values, df[target].values
X_train, y_train = X[is_train], y[is_train]
X_test, y_test = X[~is_train], y[~is_train]
N_test = len(y_test)
display(df[is_train][features].corr())
id lcavol lweight age lbph svi lcp gleason pgg45 lpsa train
0 1 -0.579818 2.769459 50 -1.386294 0 -1.386294 6 0 -0.430783 T
1 2 -0.994252 3.319626 58 -1.386294 0 -1.386294 6 0 -0.162519 T
2 3 -0.510826 2.691243 74 -1.386294 0 -1.386294 7 20 -0.162519 T
3 4 -1.203973 3.282789 58 -1.386294 0 -1.386294 6 0 -0.162519 T
4 5 0.751416 3.432373 62 -1.386294 0 -1.386294 6 0 0.371564 T
lcavol lweight age lbph svi lcp gleason pgg45
lcavol 1.000000 0.300232 0.286324 0.063168 0.592949 0.692043 0.426414 0.483161
lweight 0.300232 1.000000 0.316723 0.437042 0.181054 0.156829 0.023558 0.074166
age 0.286324 0.316723 1.000000 0.287346 0.128902 0.172951 0.365915 0.275806
lbph 0.063168 0.437042 0.287346 1.000000 -0.139147 -0.088535 0.032992 -0.030404
svi 0.592949 0.181054 0.128902 -0.139147 1.000000 0.671240 0.306875 0.481358
lcp 0.692043 0.156829 0.172951 -0.088535 0.671240 1.000000 0.476437 0.662533
gleason 0.426414 0.023558 0.365915 0.032992 0.306875 0.476437 1.000000 0.757056
pgg45 0.483161 0.074166 0.275806 -0.030404 0.481358 0.662533 0.757056 1.000000

In the scatterplot matrix of the prostate cancer data, the first row shows the response against each of the predictors in turn. Two of the predictors, svi and gleason, are categorical. Some correlations with lpsa are evident, but a good predictive model is difficult to construct by eye.

The baseline prediction is using the mean of $y$ in the training set.

from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
null_model = DummyRegressor().fit(X_train, y_train)
base_error_rate = mean_squared_error(y_test, null_model.predict(X_test))
print(f'Baseline Test Error: {base_error_rate:.3f}')
Baseline Test Error: 1.057

Least Squares

from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
scaler = StandardScaler().fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

ls = sm.OLS(y_train, sm.add_constant(X_train)).fit()
ls_params = ls.params
display(ls.summary())

y_hat = ls.predict(sm.add_constant(X_test))
ls_error_rate = mean_squared_error(y_test, y_hat)
ls_std_error = np.std((y_test - y_hat)**2, ddof=1)/np.sqrt(N_test)
print(f'Least Squares Test Error: {ls_error_rate:.3f}')
print(f'               Std Error: {ls_std_error:.3f}')
OLS Regression Results
Dep. Variable: y R-squared: 0.694
Model: OLS Adj. R-squared: 0.652
Method: Least Squares F-statistic: 16.47
Date: Tue, 08 Nov 2022 Prob (F-statistic): 2.04e-12
Time: 17:26:40 Log-Likelihood: -67.505
No. Observations: 67 AIC: 153.0
Df Residuals: 58 BIC: 172.9
Df Model: 8
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 2.4649 0.089 27.598 0.000 2.286 2.644
x1 0.6760 0.126 5.366 0.000 0.424 0.928
x2 0.2617 0.095 2.751 0.008 0.071 0.452
x3 -0.1407 0.101 -1.396 0.168 -0.343 0.061
x4 0.2091 0.102 2.056 0.044 0.006 0.413
x5 0.3036 0.123 2.469 0.017 0.057 0.550
x6 -0.2870 0.154 -1.867 0.067 -0.595 0.021
x7 -0.0212 0.144 -0.147 0.884 -0.310 0.268
x8 0.2656 0.153 1.738 0.088 -0.040 0.571
Omnibus: 0.825 Durbin-Watson: 1.690
Prob(Omnibus): 0.662 Jarque-Bera (JB): 0.389
Skew: -0.164 Prob(JB): 0.823
Kurtosis: 3.178 Cond. No. 4.47


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Least Squares Test Error: 0.521
               Std Error: 0.179

Next, we consider the aforementioned models and exhaustively search over specified parameter values for each of them. The least complex model within one standard error of the best is chosen, indicated by the vertical broken lines. The process is as follows:

  1. For each parameter value, perform K-fold cross-validation.
  2. Calculate the cross-validation mean MSE and its standard error.
  3. Find the best model, its cross-validation mean MSE and its standard error.
  4. Find the least complex model with cross-validation mean MSE within one standard error of the best.

The horizontal broken line indicates one standard error above the best mean CV error; basically, the least complex model below the line is chosen.

The grid search object uses 10-fold cross-validation. Three functions, cross_validation, assess_selected_model, and plot_cv_results, are built for our tests.

from sklearn.model_selection import GridSearchCV, KFold

def cross_validation(estimator, param_grid):
    # prepare grid search object with 10-fold cross validation it uses
    # neg_mean_squared_error due to the higher is the better approach
    K=10
    grid_search = GridSearchCV(
        estimator, param_grid, cv=KFold(n_splits=K, shuffle=True, random_state=69438),
        scoring='neg_mean_squared_error', return_train_score=True
    ).fit(X_train, y_train)

    # convert neg_mean_squared_error to mean squared error and reshape results
    # to the form where a row corresponds to one parameter's 10 cross
    # validation mse values
    cv_errors = -np.vstack([grid_search.cv_results_[f'split{i}_test_score']
                            for i in range(K)]).T
    # calculate mean squared error for parameters and their standard error
    cv_mean_errors = np.mean(cv_errors, axis=1)
    cv_std_errors = np.std(cv_errors, ddof=1, axis=1)/np.sqrt(K)

    # The least complex model within one standard error of the best is chosen
    best_index = np.argmin(cv_mean_errors)
    best_err = cv_mean_errors[best_index]
    best_std_err = cv_std_errors[best_index]
    model_index = np.argmax(cv_mean_errors < (best_err+best_std_err))
    return cv_mean_errors, cv_std_errors, best_err, best_std_err, model_index

# calculates the estimator's test mean squared error and its standard error
def assess_selected_model(estimator):
    y_hat = np.squeeze(estimator.predict(X_test))
    error_rate = mean_squared_error(y_test, y_hat)
    std_error = np.std((y_test - y_hat)**2, ddof=1)/np.sqrt(N_test)
    return error_rate, std_error

def plot_cv_results(title, x_label, test_error, std_error, complexity_vals,
                    cv_mean_errors, cv_std_errors, best_err, best_std_err,
                    selected_model_index):
    fig, ax = plt.subplots(figsize=(4, 3), dpi=110)
    ax.plot(complexity_vals, cv_mean_errors, c=ORANGE, linewidth=0.8)
    ax.errorbar(
        complexity_vals, cv_mean_errors, color=ORANGE, linestyle='None',
        marker='o', elinewidth=0.8, markersize=3, yerr=cv_std_errors,
        ecolor=BLUE, capsize=3)
    ax.axhline(y=best_err+best_std_err, c=PURPLE, linewidth=0.8,
               linestyle='--')
    ax.axvline(x=complexity_vals[selected_model_index], c=PURPLE,
               linewidth=0.8, linestyle='--')
    for i in ax.get_yticklabels() + ax.get_xticklabels():
        i.set_fontsize(6)
    ax.text(ax.get_xlim()[0], ax.get_ylim()[1]+(ax.get_ylim()[1]-ax.get_ylim()[0])*0.05, 
            title, color=GRAY4, fontsize=9)
    ax.set_xlabel(x_label, color=GRAY4, fontsize=8)
    ax.set_ylabel('CV Error', color=GRAY4, fontsize=8)
    parms = {'color': GRAY1, 'fontsize': 7,
             'bbox': {'facecolor': 'white', 'pad': 0, 'edgecolor': 'none'}}
    limx = ax.get_xlim()
    limy = ax.get_ylim()
    text_x = limx[1] - (limx[1]-limx[0])*0.02
    text_y1 = limy[1] - (limy[1]-limy[0])*0.05
    text_y2 = limy[1] - (limy[1]-limy[0])*0.12
    ax.text(
        text_x, text_y1, f'Test Error:  {test_error:.3f}', **parms, ha='right')
    ax.text(text_x, text_y2, f' Std Error:  {std_error:.3f}', **parms, ha='right')

Ridge Regression

from sklearn.linear_model import Ridge
degrees_of_freedom = list(range(1,9))
# these alpha parameters correspond to 1-8 effective degrees of freedom
parameters = {'alpha': [436, 165, 82, 44, 27, 12, 4, 1e-05]}

rd_errs, rd_std_errs, best_err, best_std_err, selected_index = \
    cross_validation(Ridge(), parameters)

rd = Ridge(alpha=parameters['alpha'][selected_index]).fit(X_train, y_train)
rd_params = np.hstack((rd.intercept_, rd.coef_))
rd_error_rate, rd_std_error = assess_selected_model(rd)
plot_cv_results(
    'Ridge Regression', 'Degrees of Freedom', rd_error_rate, rd_std_error,
    degrees_of_freedom, rd_errs, rd_std_errs, best_err, best_std_err,
    selected_index)

Lasso Regression

from sklearn.linear_model import Lasso

shrinkage = [0.102, 0.254, 0.384, 0.504, 0.612, 0.756, 0.883, 1]
# these alpha parameters correspond to above shrinkage values
parameters = {
    'alpha': [0.680, 0.380, 0.209, 0.100, 0.044, 0.027, 0.012, 0.001]}
lo_errs, lo_std_errs, best_err, best_std_err, selected_index = \
    cross_validation(Lasso(), parameters)

lo = Lasso(alpha=parameters['alpha'][selected_index]).fit(X_train, y_train)
lo_params = np.hstack((lo.intercept_, lo.coef_))
lo_error_rate, lo_std_error = assess_selected_model(lo)
plot_cv_results(
    'Lasso', 'Shrinkage Factor s', lo_error_rate, lo_std_error, shrinkage,
    lo_errs, lo_std_errs, best_err, best_std_err, selected_index)

Best Subset Regression

from itertools import combinations

class BestSubsetRegression(LinearRegression):
    def __init__(self, subset_size=1):
        LinearRegression.__init__(self)
        self.subset_size = subset_size

    def fit(self, X, y, sample_weight=None):
        best_combination, best_mse = None, None
        best_intercept_, best_coef_ = None, None
        # try all combinations of subset_size
        for combination in combinations(range(X.shape[1]), self.subset_size):
            X_tmp = X[:, combination]
            LinearRegression.fit(self, X_tmp, y)
            mse = mean_squared_error(y, self.predict(X_tmp))
            if best_mse is None or best_mse > mse:
                # select the best combination
                best_combination, best_mse = combination, mse
                best_intercept_, best_coef_ = self.intercept_, self.coef_
        LinearRegression.fit(self, X, y)
        # replace intercept and parameters with the found, set other zero
        self.intercept_ = best_intercept_
        self.coef_ = np.zeros(shape=self.coef_.shape)
        for i, idx in enumerate(best_combination):
            self.coef_[idx] = best_coef_[i]
        return self
    
# it is not possible to use Best Subset with 0 size, so run CV with 1-8 k
parameters = {'subset_size': list(range(1, 9))}
# run CV to chose best parameter
bs_errs, bs_std_errs, best_err, best_std_err, selected_index = \
    cross_validation(BestSubsetRegression(), parameters)

bs = BestSubsetRegression(subset_size=parameters['subset_size'][selected_index]).fit(X_train, y_train)
bs_params = np.hstack((bs.intercept_, bs.coef_))
bs_error_rate, bs_std_error = assess_selected_model(bs)
subset_sizes = list(range(0, 9))
plot_cv_results(
    'All Subsets', 'Subset Size', bs_error_rate, bs_std_error, subset_sizes,
    np.hstack((lo_errs[0], bs_errs)), np.hstack((lo_std_errs[0], bs_std_errs)),
    best_err, best_std_err, selected_index+1)

Principal Components Regression

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# it is not possible to use PCA with 0 directions, so run CV with 1-8 directions
parameters = {'pca__n_components': list(range(1, 9))}
model = Pipeline([('pca', PCA()),
                  ('regression', LinearRegression())])
pc_errs, pc_std_errs, best_err, best_std_err, selected_index = \
    cross_validation(model, parameters)

# refit the model to the training set with n-directions from CV
pca = PCA(n_components=parameters['pca__n_components'][selected_index])
lr = LinearRegression()
model = Pipeline([('pca', pca),
                  ('regression', lr)]).fit(X_train, y_train)
# X_train is not centered, so map the PCA-space coefficients back to the original features
# and adjust the intercept accordingly
pc_params = np.zeros(8)
intercept = lr.intercept_
for i in range(selected_index+1):
    pc_params += lr.coef_[i]*pca.components_[i, :]
    intercept -= lr.coef_[i]*(pca.mean_ @ pca.components_[i, :])
pc_params = np.hstack(([intercept], pc_params))
pc_error_rate, pc_std_error = assess_selected_model(model)
n_directions = list(range(0, 9))
plot_cv_results(
    'Principal Components Regression', 'Number of Directions', pc_error_rate,
    pc_std_error, n_directions, np.hstack((lo_errs[0], pc_errs)),
    np.hstack((lo_std_errs[0], pc_std_errs)), best_err, best_std_err,
    selected_index+1)

Partial Least Squares

from sklearn.cross_decomposition import PLSRegression

parameters = {'n_components': list(range(1, 9))}
ps_errs, ps_std_errs, best_err, best_std_err, selected_index = \
    cross_validation(PLSRegression(scale=False), parameters)

ps = PLSRegression(n_components=parameters['n_components'][selected_index],
                   scale=False).fit(X_train, y_train)
ps_params = np.hstack((ps.y_mean_, np.squeeze(ps.coef_)))
ps_error_rate, ps_std_error = assess_selected_model(ps)
n_directions = list(range(0, 9))
plot_cv_results(
    'Partial Least Squares', 'Number of Directions', ps_error_rate,
    ps_std_error, n_directions, np.hstack((lo_errs[0], ps_errs)),
    np.hstack((lo_std_errs[0], ps_std_errs)), best_err, best_std_err,
    selected_index+1)
result = zip(['Intercept'] + features,
             ls_params, bs_params, rd_params, lo_params, pc_params, ps_params)
print('      Term       LS   Best Subset    Ridge    Lasso      PCR      PLS')
print('---------------------------------------------------------------------')
for term, ls, bs, rd, lo, pc, ps in result:
    print(f'{term:>10}{ls:>9.3f}{bs:>14.3f}{rd:>9.3f}{lo:>9.3f}{pc:>9.3f}'
          f'{ps:>9.3f}')
print('---------------------------------------------------------------------')
print(f'Test Error{ls_error_rate:>9.3f}{bs_error_rate:>14.3f}'
      f'{rd_error_rate:>9.3f}{lo_error_rate:>9.3f}{pc_error_rate:>9.3f}'
      f'{ps_error_rate:>9.3f}')
print(f' Std Error{ls_std_error:>9.3f}{bs_std_error:>14.3f}'
      f'{rd_std_error:>9.3f}{lo_std_error:>9.3f}{pc_std_error:>9.3f}'
      f'{ps_std_error:>9.3f}')
      Term       LS   Best Subset    Ridge    Lasso      PCR      PLS
---------------------------------------------------------------------
 Intercept    2.465         2.477    2.464    2.468    2.497    2.452
    lcavol    0.676         0.736    0.405    0.536    0.548    0.417
   lweight    0.262         0.315    0.234    0.187    0.287    0.343
       age   -0.141         0.000   -0.042    0.000   -0.154   -0.026
      lbph    0.209         0.000    0.158    0.000    0.213    0.219
       svi    0.304         0.000    0.221    0.085    0.313    0.242
       lcp   -0.287         0.000    0.010    0.000   -0.062    0.078
   gleason   -0.021         0.000    0.042    0.000    0.226    0.011
     pgg45    0.266         0.000    0.128    0.006   -0.048    0.083
---------------------------------------------------------------------
Test Error    0.521         0.492    0.492    0.479    0.449    0.527
 Std Error    0.179         0.143    0.164    0.164    0.106    0.150

Forward Stagewise Regression

The plots below show the profiles of estimated coefficients. The left panel shows the results from the lasso for different values of the bound parameter $t = \sum_k |\alpha_k|$. The right panel shows the results of forward stagewise linear regression, using $M = 220$ consecutive steps of size $\epsilon = 0.01$.

alpha = np.linspace(0.001, 0.93, 100)
b_hat = np.vstack([Lasso(alpha=a).fit(X_train, y_train).coef_
                   for a in reversed(alpha)])
b_pow = np.sum(np.abs(b_hat), axis=1)

# Forward Stagewise Linear Regression
from sklearn.preprocessing import StandardScaler

# center the response (the algorithm works with a mean-zero target)
scaler = StandardScaler(with_std=False)
y_train_scaled = np.squeeze(scaler.fit_transform(np.atleast_2d(y_train).T))
eps, K, M = 0.01, X_train.shape[1], 220
a_hat = None
a_hat_current = np.zeros(shape=(K))
for m in range(M):
    resid = y_train_scaled - X_train @ a_hat_current
    best_sum_squares, b_star, k_star = None, None, None
    for k in range(K):
        X_train_k = X_train[:, k:k+1]
        lstsq_result = np.linalg.lstsq(X_train_k, resid, rcond=None)
        b, sum_squares = lstsq_result[0], lstsq_result[1][0]
        if best_sum_squares is None or best_sum_squares > sum_squares:
            best_sum_squares, b_star, k_star = sum_squares, b, k
    a_hat_current[k_star] += eps * np.sign(b_star)[0]
    if a_hat is None:
        a_hat = np.copy(a_hat_current)
    else:
        a_hat = np.vstack((a_hat, a_hat_current))
        
def plot_coef_profiles(ax, title, color, coef, x_values, x_title):
    for i in range(coef.shape[1]):
        ax.plot(x_values, coef[:, i], color=color, linewidth=0.7)
    ax2 = ax.twinx()
    ax2.set_ylim(ax.get_ylim())
    plt.setp(ax2, yticks=coef[-1], yticklabels=features)
    for i in ax.get_yticklabels() + ax.get_xticklabels() + \
             ax2.get_yticklabels():
        i.set_fontsize(6)
    ax.set_xlabel(x_title, color=GRAY4, fontsize=7)
    ax.set_ylabel('Coefficients', color=GRAY4, fontsize=7)
    ax.text(ax.get_xlim()[0], ax.get_ylim()[1]+(ax.get_ylim()[1]-ax.get_ylim()[0])*.02, title, color=GRAY4,
            fontsize=7)

fig, axarr = plt.subplots(1, 2, figsize=(5, 3), dpi=150)
plt.subplots_adjust(wspace=0.65)
# plt.rcParams['text.usetex'] = True
plot_coef_profiles(axarr[0], 'Lasso', 'red', b_hat, b_pow, r'$||\beta||_1$')
plot_coef_profiles(axarr[1], 'Forward Stagewise', 'blue', a_hat, range(M),'Iteration')

Bayesian Regression

We assume that the target variable $y$ is given by $f(\mathbf{x}, \mathbf{w})$ plus additive Gaussian noise with precision $\beta$, i.e., $y = f(\mathbf{x}, \mathbf{w}) + \epsilon$ with $\epsilon \sim \mathcal{N}(0, \beta^{-1})$. This defines the generative model

\begin{align*} p(y|\mathbf{x},\mathbf{w}, \beta) &= \mathcal{N}(y|f(\mathbf{x},\mathbf{w}), \beta^{-1}) \label{prob_dist_target} \end{align*}
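
As a quick illustration of this generative view, the following sketch draws targets by adding zero-mean Gaussian noise with variance $\beta^{-1}$ to the deterministic part $\mathbf{\Phi}\mathbf{w}$. The toy design matrix Phi, weights w_true, and precision beta below are assumptions for illustration only, not quantities from the notebook.

# sketch: sample targets from the generative model y = Phi @ w + eps, eps ~ N(0, 1/beta)
rng = np.random.default_rng(0)
beta = 25.0                                                    # assumed noise precision
Phi = np.column_stack((np.ones(50), np.linspace(-1, 1, 50)))   # assumed toy design matrix
w_true = np.array([-0.3, 0.5])                                 # assumed true weights
y_sample = Phi @ w_true + rng.normal(scale=1 / np.sqrt(beta), size=Phi.shape[0])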

Maximum Likelihood

The log likelihood function is

\begin{align} \ln p(\mathbf{y}|\mathbf{X},\mathbf{w}, \beta) & = \frac{n}{2} \ln \beta - \frac{n}{2} \ln(2\pi) - \frac{\beta}{2} \textrm{E}_{\mathcal{D}}(\mathbf{w})\nonumber \end{align}

where \begin{align*} \textrm{E}_{\mathcal{D}}(\mathbf{w}) = \sum_{i=1}^{n}\left( y_i - \mathbf{w}^\intercal \phi(\mathbf{x}_{i}) \right)^2 = \text{RSS}(\mathbf{w}) \end{align*}

ML = LS

So, we have \begin{align*} \max ~\ln p(\mathbf{y}|\mathbf{X},\mathbf{w}, \beta) \text{ is equivalent to } \min~\text{RSS}(\mathbf{w}) \end{align*}

Proof

Note that

\begin{align} p(y|\mathbf{x},\mathbf{w}, \beta) &= \mathcal{N}(y|f(\mathbf{x},\mathbf{w}), \beta^{-1}) = \mathcal{N}(y|\mathbf{w}^\intercal \phi(\mathbf{x}), \beta^{-1}) \nonumber \end{align}

The likelihood function is

\begin{align} p(\mathbf{y}|\mathbf{X},\mathbf{w}, \beta) &= \prod_{i=1}^{n} \mathcal{N}(y_i|\mathbf{w}^\intercal \phi(\mathbf{x}_{i}), \beta^{-1}) \nonumber \\ &= \prod_{i=1}^{n} \frac{\beta^{1/2}}{(2\pi)^{1/2}} \exp\left( -\frac{\beta}{2} (y_i - \mathbf{w}^\intercal \phi(\mathbf{x}_{i}))^2 \right) \label{likelihood} \end{align}

The log likelihood function is

\begin{align} \ln p(\mathbf{y}|\mathbf{X},\mathbf{w}, \beta) &= \frac{n}{2} \ln \beta - \frac{n}{2} \ln(2\pi) - \frac{\beta}{2} \sum_{i=1}^{n}\left( y_i - \mathbf{w}^\intercal \phi(\mathbf{x}_{i}) \right)^2 \nonumber \\ & = \frac{n}{2} \ln \beta - \frac{n}{2} \ln(2\pi) - \frac{\beta}{2} \textrm{E}_{\mathcal{D}}(\mathbf{w})\nonumber \end{align}
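
As a quick numerical check of this equivalence, the sketch below (toy design matrix Phi and targets y, assumed for illustration) verifies that the $\mathbf{w}$ maximizing the Gaussian log likelihood is the least-squares solution, regardless of $\beta$.

# sketch: the ML estimate of w (normal equations) coincides with the least-squares fit
rng = np.random.default_rng(1)
Phi = np.column_stack((np.ones(40), rng.normal(size=(40, 2))))
y = Phi @ np.array([1.0, 2.0, -0.5]) + rng.normal(scale=0.3, size=40)

w_ls = np.linalg.lstsq(Phi, y, rcond=None)[0]       # minimizes RSS(w)
w_ml = np.linalg.solve(Phi.T @ Phi, Phi.T @ y)      # maximizes the log likelihood
print(np.allclose(w_ls, w_ml))                      # True: ML estimate = LS estimate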


Regularization

We can view regularized estimates as Bayesian (MAP) estimates. Suppose the noise precision parameter $\beta$ is known. Since the inputs $\mathbf{x}$ always appear in the set of conditioning variables, we drop the explicit $\mathbf{X}$ from subsequent expressions for simplicity. The likelihood function is

\begin{align} p(\mathbf{y}|\mathbf{X},\mathbf{w}) \equiv p(\mathbf{y}|\mathbf{w}) = \prod_{i=1}^{n} \mathcal{N}\left(y_i|\mathbf{w}^\intercal \phi(\mathbf{x}_i), \beta^{-1} \right) \nonumber \end{align}

The corresponding conjugate prior is $p(\mathbf{w}| \alpha) = \mathcal{N}(\mathbf{w} | \mathbf{0}, \alpha^{-1}\mathbf{I})$. The posterior distribution satisfies

\begin{align} p(\mathbf{w} | \mathbf{y}) & \propto p(\mathbf{y}|\mathbf{w}, \beta) p(\mathbf{w}) \nonumber \end{align}

The log of the posterior distribution takes the form

\begin{align*} \ln p(\mathbf{w} | \mathbf{y}, \mathbf{X}) &= \ln p(\mathbf{y}|\mathbf{w}, \beta) + \ln p(\mathbf{w})\\ &= \sum_{i=1}^{n} \ln \mathcal{N}\left(y_i|\mathbf{w}^\intercal \phi(\mathbf{x}_i), \beta^{-1} \right) + \ln \mathcal{N}(\mathbf{w} | \mathbf{0}, \alpha^{-1}\mathbf{I}) \\ &= -\frac{\beta}{2} \sum_{i=1}^{n} \left( y_i - \mathbf{w}^\intercal \phi(\mathbf{x}_i) \right)^2 - \frac{\alpha}{2} \mathbf{w}^\intercal \mathbf{w} + \text{const.} \nonumber \end{align*}
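
Maximizing this log posterior over $\mathbf{w}$ is therefore equivalent to ridge regression with penalty $\lambda = \alpha/\beta$. The sketch below checks this on toy data (assumed for illustration only; fit_intercept=False because the design matrix is taken to carry its own bias column):

# sketch: the MAP estimate under a N(0, 1/alpha I) prior equals ridge with lambda = alpha/beta
from sklearn.linear_model import Ridge

rng = np.random.default_rng(2)
Phi = rng.normal(size=(60, 4))
y = Phi @ np.array([1.0, -2.0, 0.5, 0.0]) + rng.normal(scale=0.5, size=60)

alpha, beta = 2.0, 25.0
m = Phi.shape[1]
w_map = np.linalg.solve(alpha * np.eye(m) + beta * Phi.T @ Phi, beta * Phi.T @ y)
w_ridge = Ridge(alpha=alpha / beta, fit_intercept=False).fit(Phi, y).coef_
print(np.allclose(w_map, w_ridge))   # True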

Posterior Distribution

Moreover, we have $p(\mathbf{w} | \mathbf{y}) = \mathcal{N}(\mathbf{w} | \mathbf{m}_n, \mathbf{S}_n)$.

Technical Details

We consider a general prior $p(\mathbf{w}) = \mathcal{N}(\mathbf{w} | \mathbf{m}_0, \mathbf{S}_0)$, for which the posterior is

\begin{align*} p(\mathbf{w} | \mathbf{y}) & \propto \left[ \prod_{i=1}^{n} \mathcal{N}\left(y_i|\mathbf{w}^\intercal \phi(\mathbf{x}_i), \beta^{-1} \right) \right] \mathcal{N}(\mathbf{w} | \mathbf{m}_0, \mathbf{S}_0) \\ & \propto \exp \left( -\frac{\beta}{2} (\mathbf{y} - \Phi \mathbf{w})^\intercal (\mathbf{y} - \Phi \mathbf{w})\right) \exp \left( -\frac{1}{2} (\mathbf{w} - \mathbf{m}_0)^\intercal \mathbf{S}^{-1}_0 (\mathbf{w} - \mathbf{m}_0)\right) \\ & = \exp \left[ -\frac{1}{2} \left(\mathbf{w}^\intercal (\mathbf{S}^{-1}_0 + \beta\Phi^\intercal \Phi ) \mathbf{w} - (\mathbf{S}^{-1}_0 \mathbf{m}_0 + \beta \Phi^\intercal \mathbf{y})^\intercal \mathbf{w} - \mathbf{w}^\intercal (\mathbf{S}^{-1}_0 \mathbf{m}_0 + \beta \Phi^\intercal \mathbf{y})\right) \right] \\ & \hspace{2cm} \exp \left[ -\frac{1}{2} \left( \beta \mathbf{y}^\intercal \mathbf{y} + \mathbf{m}_0^\intercal \mathbf{S}_0^{-1} \mathbf{m}_0\right) \right] \\ & = \exp \left( -\frac{1}{2} (\mathbf{w} - \mathbf{m}_n)^\intercal \mathbf{S}^{-1}_n (\mathbf{w} - \mathbf{m}_n)\right) \exp \left( -\frac{1}{2} (\beta \mathbf{y}^\intercal \mathbf{y} + \mathbf{m}_0^\intercal \mathbf{S}_0^{-1} \mathbf{m}_0 - \mathbf{m}_n^\intercal \mathbf{S}_n^{-1} \mathbf{m}_n)\right) \end{align*}

where

\begin{align*} \mathbf{m}_n = \mathbf{S}_n \left( \mathbf{S}^{-1}_0 \mathbf{m}_0 + \beta \mathbf{\Phi}^\intercal \mathbf{y} \right) \text{ and } \mathbf{S}^{-1}_n = \mathbf{S}^{-1}_0 + \beta \mathbf{\Phi}^\intercal \mathbf{\Phi} \end{align*}

The first exponential is the (unnormalized) posterior Gaussian distribution over $\mathbf{w}$, while the second exponential is independent of $\mathbf{w}$ and hence can be absorbed into the normalization factor. For the prior $p(\mathbf{w}| \alpha) = \mathcal{N}(\mathbf{w} | \mathbf{0}, \alpha^{-1}\mathbf{I})$, these reduce to

\begin{align} \mathbf{m}_n = \beta \mathbf{S}_n\mathbf{\Phi}^\intercal \mathbf{y} \text{ and } \mathbf{S}^{-1}_n = \alpha\mathbf{I} + \beta \mathbf{\Phi}^\intercal \mathbf{\Phi} \nonumber \end{align}
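
The sequential examples below use a Bayesian model object with attributes w_mean, w_cov and methods fit, predict_dist (defined elsewhere in the notebook). A minimal sketch of what such a class might look like, built directly from the general $\mathbf{m}_n$, $\mathbf{S}_n$ update above and treating each call's posterior as the next call's prior, is given here; the names mirror the usage below, but the body is an assumption, not necessarily the notebook's actual implementation.

# minimal sketch of a Bayesian linear regression class based on the m_n, S_n formulas above
class BayesianSketch:
    def __init__(self, alpha=1.0, beta=1.0):
        self.alpha = alpha      # prior precision
        self.beta = beta        # noise precision
        self.w_mean = None      # m_n (starts at m_0 = 0)
        self.w_cov = None       # S_n (starts at S_0 = I / alpha)

    def fit(self, Phi, y):
        if self.w_mean is None:
            m = Phi.shape[1]
            self.w_mean = np.zeros(m)
            self.w_cov = np.eye(m) / self.alpha
        S0_inv = np.linalg.inv(self.w_cov)
        Sn_inv = S0_inv + self.beta * Phi.T @ Phi                    # S_n^{-1}
        self.w_cov = np.linalg.inv(Sn_inv)                           # S_n
        self.w_mean = self.w_cov @ (S0_inv @ self.w_mean + self.beta * Phi.T @ y)  # m_n
        return self

    def predict_dist(self, Phi):
        mean = Phi @ self.w_mean                                     # m_n^T phi(x)
        var = 1 / self.beta + np.sum(Phi @ self.w_cov * Phi, axis=1) # 1/beta + phi^T S_n phi
        return mean, np.sqrt(var)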


Bayesian Learning

We illustrate Bayesian learning and the sequential update of a posterior distribution on a simple example involving straight-line fitting.

Consider data generated from the straight line $y(x) = -0.3 + 0.5 x$ with additive Gaussian noise, fitted with the linear model $y(x, \mathbf{w}) = w_0 + w_1 x$. We demonstrate how $p(\mathbf{w} | \mathbf{y}, \mathbf{X})$ changes as observations are incorporated:

  • Figure "prior/posterior" shows the initial belief of $\mathbf{w}$ and how it involves after knowing more data points. Based on Bayesian model, prior/posterior distribution is a normal distribution

    multivariate_normal.pdf(w, mean=model.w_mean, cov=model.w_cov)

    and its mean moves from $(0, 0)$ toward the true value $(-0.3, 0.5)$.

  • The likelihood function provides a soft constraint that the line must pass close to the data point, where "close" is determined by the noise precision $\beta$. We plot $p(y_0|x_0, \mathbf{w})$ for the most recently observed point (the one at index end-1). The precision-scaled pdf

    contour(w0, w1, multivariate_normal.pdf((-t_train[end-1: end] + w @ phi_train[end-1: end].T)*model.beta))

    evaluates, for every value of $\mathbf{w}$ on the grid, how probable the observed response $y_0$ is. The contour plot always highlights a straight stripe near the true value $(-0.3, 0.5)$ because any $\mathbf{w}$ on the line $y_0 = w_0 + w_1 x_0$ has a large likelihood value.

  • The lines in the "data space" panel are generated by sampling $\mathbf{w}$ from the current posterior belief and plotting the corresponding fits.

def linear(x):
    return -0.3 + 0.5 * x

w0, w1 = np.meshgrid(
    np.linspace(-1, 1, 100),
    np.linspace(-1, 1, 100))
w = np.array([w0, w1]).transpose(1, 2, 0)

x_train, t_train = create_data(linear, 20, 0.1, [-1, 1])
x_test = np.linspace(-1, 1, 100)

phi_train = Polynomial(1).dm(x_train)
phi_test  = Polynomial(1).dm(x_test)

model = Bayesian(alpha=2., beta=25.)

for beg, end in [[0, 0], [0, 1], [1, 2], [2, 3], [3, 20]]:
    model.fit(phi_train[beg: end], t_train[beg: end])
    
    fig, axes = plt.subplots(1,3,figsize=(16, 4))
    if end > 0:
        axes[0].scatter(-0.3, 0.5, s=200, marker="x")
        # likelihood of the latest observation, scaled by the noise precision
        axes[0].contour(w0, w1, 
                        multivariate_normal.pdf((-t_train[end-1: end] + w @ phi_train[end-1: end].T)*model.beta))
    axes[0].set_xlabel("$w_0$")
    axes[0].set_ylabel("$w_1$")
    axes[0].set_title("likelihood")    
    
    axes[1].scatter(-0.3, 0.5, s=200, marker="x")
    axes[1].contour(w0, w1, multivariate_normal.pdf(w, mean=model.w_mean, cov=model.w_cov))
    axes[1].set_xlabel("$w_0$")
    axes[1].set_ylabel("$w_1$")
    axes[1].yaxis.set_label_coords(-.12,.5)
    axes[1].set_title("prior/posterior")
    
    axes[2].scatter(x_train[:end], t_train[:end], s=20, facecolor="none", edgecolor="steelblue", lw=1)
    # use posterior dist to generate w, and then w is used for prediction
    axes[2].plot(x_test, 
                 phi_test @ np.random.multivariate_normal(model.w_mean, model.w_cov, size=6).T, c="orange")
    axes[2].set_xlim(-1, 1)
    axes[2].set_ylim(-1, 1)
    axes[2].set_xlabel("$x$")
    axes[2].set_ylabel("$y$") 
    axes[2].yaxis.set_label_coords(-.12,.5)
    axes[2].set_title(r"data space $n={}$".format(end))
    plt.show()

Bayesian Prediction

We use curve fitting of the sinusoidal data with Gaussian basis functions to illustrate the predictive distribution for Bayesian linear regression models. The shaded band shows one predictive standard deviation around the mean prediction.

x_train, t_train = create_data(sinusoidal, 10, 0.25)
x_test = np.linspace(0, 1, 100)
t_test = sinusoidal(x_test)

phi_train = Gaussian(np.linspace(0, 1, 9), 0.1).dm(x_train)
phi_test  = Gaussian(np.linspace(0, 1, 9), 0.1).dm(x_test)

t, t_std = Bayesian(alpha=2e-3, beta=2).fit(phi_train, t_train).predict_dist(phi_test)

plt.scatter(x_train, t_train, facecolor="none", edgecolor="b", s=50, label="training data")
plt.plot(x_test, t_test, c="g", label="$\sin(2\pi x)$")
plt.plot(x_test, t, c="r", label="mean")
plt.fill_between(x_test, t_test - t_std, t_test + t_std, color="pink", label="std.", alpha=0.5)
plt.xlim(-0.1, 1.1)
plt.ylim(-1.5, 1.5)
plt.annotate("n=9", xy=(0.8, 1))
plt.legend(bbox_to_anchor=(1.05, 1.), loc=2, borderaxespad=0.)
plt.show()

To revisit the example in more detail, we show how the predictive distribution for the Bayesian linear regression model evolves as data points are learned sequentially.

x_train, t_train = create_data(sinusoidal, 25, 0.25)
x_test = np.linspace(0, 1, 100)
t_test = sinusoidal(x_test)

phi_train = Gaussian(np.linspace(0, 1, 9), 0.1).dm(x_train)
phi_test  = Gaussian(np.linspace(0, 1, 9), 0.1).dm(x_test)

model = Bayesian(alpha=2e-3, beta=2)

plt.figure(figsize=(18, 8))
for i, [beg, end] in enumerate([[0, 1], [1, 2], [2, 4], [4, 8], [8, 25]]):
    t, t_std = model.fit(phi_train[beg: end], t_train[beg: end]).predict_dist(phi_test)
    
    plt.subplot(2,3,i+1)
    
    plt.scatter(x_train[:end], t_train[:end], s=100, facecolor="none", edgecolor="steelblue", lw=2)
    plt.plot(x_test, t_test, label=r"$\sin(2\pi x)$")
    plt.plot(x_test, t, label="fitting")
    plt.fill_between(x_test, t - t_std, t + t_std, color="orange", alpha=0.5)
    plt.xlim(-0.02, 1)
    plt.ylim(-2, 2)
    plt.title(r"data size $n={}$".format(end))
    plt.legend()
plt.show()

Evidence Approximation

The predictive distribution is \begin{align} p(t|\mathbf{y}, \alpha, \beta) = \mathcal{N} \left( t | \mathbf{m}^\intercal_{n} \phi(\mathbf{x}), \sigma^2_n(\mathbf{x}) \right) \nonumber \end{align}

The key is to obtain the hyperparameters $\alpha, \beta$ as well as $\mathbf{m}_n$ and $\mathbf{S}_n$, where the predictive variance is $\sigma^2_n(\mathbf{x}) = \beta^{-1} + \phi(\mathbf{x})^\intercal \mathbf{S}_n \phi(\mathbf{x})$.

  1. Initialization: $k = 0$, $\alpha^0 = \alpha_0$ and $\beta^0 = \beta_0$

  2. Find eigenvalues $\lambda_i$, $i=0, \ldots, M-1$ such that \begin{align} \left(\beta \Phi^\intercal \Phi \right) \mathbf{u}_i = \lambda_i \mathbf{u}_i \nonumber \end{align}

  3. Let
    \begin{align} \gamma^k = \sum_{i=0}^{M-1} \frac{\lambda_i}{\alpha^k+\lambda_i},~ \alpha^{k+1} = \frac{\gamma^k}{\mathbf{w}^\intercal_{mean} \mathbf{w}_{mean}} \text{ and } \frac{1}{\beta^{k+1}} = \frac{1}{n - \gamma^k} \sum_{i=1}^{n} \left( y_i - \mathbf{w}^\intercal_{mean} \phi(\mathbf{x}_i) \right)^2 \nonumber \end{align} where $\mathbf{w}_{mean} = \mathbf{m}_n = \beta \mathbf{S}_n\mathbf{\Phi}^\intercal \mathbf{y}$.

  4. If $|\alpha^{k+1} - \alpha^{k}| + |\beta^{k+1} - \beta^{k}| <$ threshold, then return $\alpha, \beta$, else $k = k+1$ and go to step 2.
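
The Empirical model used below presumably implements an iteration of this kind; a minimal sketch of the re-estimation loop (an assumption for illustration, not the notebook's actual implementation) is:

# sketch: evidence-approximation re-estimation of alpha and beta
def evidence_approximation(Phi, y, alpha=1.0, beta=1.0, max_iter=100, tol=1e-6):
    n, m = Phi.shape
    for _ in range(max_iter):
        eigvals = np.linalg.eigvalsh(beta * Phi.T @ Phi)            # lambda_i of beta Phi^T Phi
        S_n = np.linalg.inv(alpha * np.eye(m) + beta * Phi.T @ Phi)
        m_n = beta * S_n @ Phi.T @ y                                # posterior mean w_mean
        gamma = np.sum(eigvals / (alpha + eigvals))                 # effective number of parameters
        alpha_new = gamma / (m_n @ m_n)
        beta_new = (n - gamma) / np.sum((y - Phi @ m_n) ** 2)
        converged = abs(alpha_new - alpha) + abs(beta_new - beta) < tol
        alpha, beta = alpha_new, beta_new
        if converged:
            break
    return alpha, beta, m_n, S_n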

For polynomial regression models with degree 0 through 7, we evaluate the evidence function. The empirical Bayes approach is applied, and the evidence favours the model with degree $= 3$.

def cubic(x):
    return x * (x - 5) * (x + 5)

x_train, t_train = create_data(cubic, 30, 10, [-5, 5])
x_test = np.linspace(-5, 5, 100)
t_test = cubic(x_test)

phi_train = [Polynomial(i).dm(x_train) for i in range(8)]

models = [Empirical(alpha=100., beta=100.).fit(phi, t_train, max_iter=100) for phi in phi_train]
evidences = [model.log_evidence(phi, t_train) for model, phi in zip(models, phi_train)]

opt_degree = int(np.nanargmax(evidences))

phi_test = Polynomial(opt_degree).dm(x_test)
t, t_std = models[opt_degree].predict_dist(phi_test)

plt.scatter(x_train, t_train, s=50, facecolor="none", edgecolor="steelblue", label="observation")
plt.plot(x_test, t_test, label="x(x-5)(x+5)")
plt.plot(x_test, t, label="prediction")
plt.fill_between(x_test, t - t_std, t + t_std, alpha=0.5, label="std", color="orange")
plt.legend()
plt.show()

plt.plot(evidences)
plt.title("Model evidence")
plt.xlabel("degree")
plt.ylabel("log evidence")
plt.show()